# Phrase patterns for "Get <name> free" and "Get <name> now". Group 1 captures
# the text between the leading "get" and the trailing keyword, so the candidate
# name can be read directly from the match instead of being carved out with
# str.replace (which also removed "get"/"free"/"now" from inside the name).
# This doesn't always work, eg 'get your free ...' / 'get it free'
# TODO: Also look for 'Get x on' etc
# Also look for 'Download x now' etc
_GET_FREE_PATTERN = r"^[Gg][Ee][Tt]([\w.\s]*)[Ff][Rr][Ee][Ee]$"
_GET_NOW_PATTERN = r"^[Gg][Ee][Tt]([\w.\s]*)[Nn][Oo][Ww]$"


def _register_candidate_software(software, tweet, tweet_id):
    """Queue *software* as a newly found name unless the database knows it.

    Names for which ``sql.isSoftware`` is true are skipped. Otherwise the name
    is passed to ``check_bing``; on a truthy result it is added to the
    module-level ``new_software`` collection together with *tweet*, and
    *tweet_id* is appended to ``possible_tags``.

    Raises:
        IncompleteTaggingError: if a ``ServerError`` is raised during the
            lookup (the error is printed first).
    """
    if sql.isSoftware(software):
        return
    try:
        if check_bing(software, bing):
            # Add newly-found software names to list, add to dictionary at end
            new_software.add(software, tweet)
            possible_tags.append(tweet_id)
    except ServerError as e:
        print(e)
        raise IncompleteTaggingError()


def tag_tweets(ngrams, tweet_id):
    """Scan a tweet's n-grams for "Get <name> free" / "Get <name> now" phrases.

    Visits ``ngrams[len(ngrams)]`` down to ``ngrams[1]``. For every phrase that
    matches one of the two patterns, the text between "get" and the trailing
    keyword (whitespace-stripped) is treated as a candidate software name and
    handed to ``_register_candidate_software``. A "... free" phrase also
    records ``"price": "free"`` on the tweet's tag dictionary.

    Args:
        ngrams: container indexed from 1 to ``len(ngrams)`` whose entries are
            iterables of strings (presumably keyed by n-gram length -- TODO
            confirm against the caller).
        tweet_id: stored under ``"tweet_db_id"`` and appended to
            ``possible_tags`` when a new software name is found.

    Raises:
        IncompleteTaggingError: if the lookup of a candidate name raises
            ``ServerError``.

    NOTE(review): nothing is returned here, so the tag dictionary is only
    reachable through ``new_software`` -- confirm whether a trailing
    ``return tweet`` was lost. Likewise ``prev_is_software`` is never set to
    True in this body, so the version branch is currently unreachable.
    """
    tweet = Dictionary()
    tweet.add("tweet_db_id", tweet_id)
    prev_is_software = False
    for i in range(len(ngrams), 0, -1):
        for word in ngrams[i]:
            if prev_is_software:
                if check_version(word):
                    tweet.add("version", word)
                prev_is_software = False
                continue

            # Look for 'Get x free'
            free_match = re.match(_GET_FREE_PATTERN, word)
            if free_match:
                software = free_match.group(1).strip()
                _register_candidate_software(software, tweet, tweet_id)
                # The price is recorded whether or not the name was new.
                tweet.add("price", "free")
                continue

            # Look for 'Get x now'
            now_match = re.match(_GET_NOW_PATTERN, word)
            if now_match:
                software = now_match.group(1).strip()
                _register_candidate_software(software, tweet, tweet_id)
Beispiel #2
0
    def _ngram_tagger(self, ngram, tweet_id):
        """Build the tag dictionary for one tweet.

        The dictionary always carries ``tweet_db_id``. When ``self._keyword``
        is set, it is classified as software, company or OS (first match wins,
        in that order) and the matching ``<kind>_name`` / ``<kind>_id`` pair is
        added. Every element of *ngram* is then passed to ``self._tagger``
        together with the dictionary.

        Args:
            ngram: iterable whose elements are handed to ``self._tagger``.
            tweet_id: value stored under ``tweet_db_id``.

        Returns:
            The populated ``Dictionary``.
        """
        tags = Dictionary()
        tags.add('tweet_db_id', tweet_id)
        if self._keyword:
            keyword = self._keyword
            # (tag prefix, membership test, fetch) in precedence order. The
            # fetch methods take no argument -- presumably they return the row
            # found by the preceding membership test; verify against the sql
            # helper.
            lookups = (
                ('software', self._sql.isSoftware, self._sql.getSoftware),
                ('company', self._sql.isCompany, self._sql.getCompany),
                ('os', self._sql.isOS, self._sql.getOS),
            )
            for prefix, is_kind, get_kind in lookups:
                if is_kind(keyword):
                    entry = get_kind()
                    tags.add(prefix + '_name', keyword)
                    # The first field of the fetched entry is stored as the id.
                    tags.add(prefix + '_id', str(entry[0]))
                    break

        for tagged_words in ngram:
            # The return value is ignored; presumably _tagger records its
            # findings on `tags` in place.
            self._tagger(tagged_words, tags)
        return tags