def tag_tweets(ngrams, tweet_id):
    """Scan a tweet's n-grams for "Get <name> free" / "Get <name> now" offers.

    ``ngrams`` is indexed with the integers ``len(ngrams)`` down to ``1``
    (presumably a mapping from n-gram length to the n-grams of that length --
    TODO confirm against the caller); each ``ngrams[i]`` is iterated to get
    the candidate strings.  Larger indices are visited first.

    For every candidate that looks like "Get <name> free" or "Get <name> now"
    the text between the two keywords is taken as a possible software name.
    Names not already known to ``sql`` are checked with ``check_bing``; the
    confirmed ones are queued in ``new_software`` and ``tweet_id`` is
    appended to ``possible_tags``.

    Raises:
        IncompleteTaggingError: if the Bing check fails with ``ServerError``.

    NOTE(review): the visible code never returns ``tweet``; the tags only
    leave this function through ``new_software.add``.  Confirm whether a
    ``return tweet`` is expected by callers.
    """
    tweet = Dictionary()
    tweet.add("tweet_db_id", tweet_id)

    # The group captures the text between the anchored keywords.  Capturing it
    # directly (instead of str.replace-ing the matched keywords out of the
    # string) keeps names that themselves contain "get", "free" or "now"
    # intact, e.g. "Get freemind free" -> "freemind".
    # This doesn't always work, eg 'get your free ...' / 'get it free'
    # TODO: Also look for 'Get x on' etc
    #       Also look for 'Download x now' etc
    get_free = re.compile(r"^[Gg][Ee][Tt]([\w.\s]*)[Ff][Rr][Ee][Ee]$")
    get_now = re.compile(r"^[Gg][Ee][Tt]([\w.\s]*)[Nn][Oo][Ww]$")

    # NOTE(review): nothing visible here ever sets this flag to True, so the
    # version branch below is currently unreachable.
    prev_is_software = False

    for i in range(len(ngrams), 0, -1):
        for word in ngrams[i]:
            if prev_is_software:
                if check_version(word):
                    tweet.add("version", word)
                # NOTE(review): reset placement reconstructed from flattened
                # source; the flag is cleared after inspecting one word.
                prev_is_software = False
                continue

            # Look for 'Get x free'
            software = _extract_offer_name(word, get_free)
            if software is not None:
                _queue_if_new_software(software, tweet, tweet_id)
                # NOTE(review): placement reconstructed from flattened source;
                # the price tag is applied whenever the pattern matched,
                # whether or not the name was already known.
                tweet.add("price", "free")
                continue

            # Look for 'Get x now'
            software = _extract_offer_name(word, get_now)
            if software is not None:
                _queue_if_new_software(software, tweet, tweet_id)


def _extract_offer_name(word, pattern):
    """Return the stripped text captured by ``pattern`` in ``word``.

    ``pattern`` is a compiled regex with exactly one capturing group.
    Returns ``None`` when ``word`` does not match; a match with nothing
    between the keywords returns the empty string.
    """
    found = pattern.match(word)
    if found is None:
        return None
    return found.group(1).strip()


def _queue_if_new_software(software, tweet, tweet_id):
    """Queue ``software`` for insertion if it is unknown and Bing confirms it.

    Raises ``IncompleteTaggingError`` when a ``ServerError`` occurs while
    checking or queueing the name.
    """
    if sql.isSoftware(software):
        return
    try:
        if check_bing(software, bing):
            # Add newly-found software names to list, add to dictionary at
            # end (the database insert is done at the end, not here).
            new_software.add(software, tweet)
            possible_tags.append(tweet_id)
    except ServerError as e:
        print(e)
        raise IncompleteTaggingError()
def _ngram_tagger(self, ngram, tweet_id):
    """Build and return the tag dictionary for one tweet.

    The dictionary always contains ``tweet_db_id``.  When ``self._keyword``
    is set, it is looked up as a software name, then a company name, then an
    OS name; the first category that recognises it contributes a
    ``<category>_name`` / ``<category>_id`` pair.  Every element of
    ``ngram`` is then passed to ``self._tagger`` together with the
    dictionary.

    Returns:
        The ``Dictionary`` of tags collected for ``tweet_id``.
    """
    tags = Dictionary()
    tags.add('tweet_db_id', tweet_id)

    keyword = self._keyword
    if keyword:
        # The get*() calls take no argument; presumably they return the row
        # found by the preceding is*() check -- TODO confirm.  The first
        # element of that row is stored (as a string) as the id.
        if self._sql.isSoftware(keyword):
            entry = self._sql.getSoftware()
            tags.add('software_name', keyword)
            tags.add('software_id', str(entry[0]))
        elif self._sql.isCompany(keyword):
            entry = self._sql.getCompany()
            tags.add('company_name', keyword)
            tags.add('company_id', str(entry[0]))
        elif self._sql.isOS(keyword):
            entry = self._sql.getOS()
            tags.add('os_name', keyword)
            tags.add('os_id', str(entry[0]))

    # The return value of _tagger is ignored; it is handed ``tags`` so it can
    # presumably add to it in place -- verify against _tagger.
    for tagged_words in ngram:
        self._tagger(tagged_words, tags)

    return tags