Code Example #1
File: processor.py Project: nikingale/typewriter
def lemmatize(words):
    """Return the lemma of each (token, POS-tag) pair in `words`."""
    # Assumes `from nltk.stem import WordNetLemmatizer` and a project-level
    # get_wordnet_pos() that maps Penn Treebank tags to WordNet POS constants.
    roots = []
    lemmatizer = WordNetLemmatizer()
    for inflection, tag in words:
        wordnet_tag = get_wordnet_pos(tag)
        if wordnet_tag == '':
            # no usable WordNet POS: keep the surface form unchanged
            root = inflection
        else:
            root = lemmatizer.lemmatize(inflection, wordnet_tag)
        roots.append(root)
    return roots
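
Every snippet on this page relies on a helper called get_wordnet_pos(), whose definition is not shown. For reference, a minimal sketch of a typical implementation (an assumption, not the actual code of any of these projects) maps Penn Treebank tags to WordNet POS constants and returns '' when there is no match:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS constant; '' means 'no match'."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return ''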
Code Example #2
    def get_sentiment_from_text_sentiwordnet(self, text):
        """Accumulate sentiment scores over the disambiguated synsets of `text`."""
        # Assumes `lesk` from nltk.wsd and a get_wordnet_pos() that maps
        # Treebank tags to WordNet POS constants ('' when there is no mapping).
        tokens = nltk.word_tokenize(text)
        tokens_tagged = nltk.pos_tag(tokens)
        total_sentiment = np.zeros(self.get_lexicon_shape())

        for word, treebank_tag in tokens_tagged:
            pos = get_wordnet_pos(treebank_tag)
            if pos != '':
                # disambiguate the word in context, then look up its sentiment scores
                synset = lesk(tokens, word, pos)
                if synset is not None:
                    try:
                        total_sentiment += self.get_sentiment_from_lexicon(synset.name())
                    except KeyError:
                        # synset not present in the lexicon: skip it
                        pass

        self.logger.debug('Text: ' + text)
        tokens = self.preprocess(text, join_negatives=False)
        self.logger.debug('Tokens: ' + str(tokens))

        # return only the positive/negative components, plus the preprocessed tokens
        return total_sentiment[:2], tokens
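
get_sentiment_from_lexicon() is project-specific and not shown here. If the lexicon is SentiWordNet itself, as the method name suggests (an assumption), the lookup could be sketched with NLTK's sentiwordnet corpus reader roughly like this:

import numpy as np
from nltk.corpus import sentiwordnet as swn

def sentiment_for_synset(synset_name):
    """Hypothetical helper: (positive, negative, objective) scores for a synset name."""
    senti = swn.senti_synset(synset_name)  # e.g. 'good.a.01'
    return np.array([senti.pos_score(), senti.neg_score(), senti.obj_score()])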
Code Example #3
    def load_and_clean_data(self):
        """Read a CoNLL-style file into sentence/entity strings and build self.df."""
        all_sentences, named_entities = [], []
        word_lemmatizer = WordNetLemmatizer()
        with open(self.file_path) as train_file:
            words, entities, unique_entities = [], [], set()
            for line in islice(train_file, 2, None):
                # skip the first two (header) lines, then read one token per line
                word = line.split(' ')[0]
                named_entity = line.split(' ')[-1].strip('\n')

                word = self.clean_word(word)

                if line in ('\n', '\r\n'):
                    # a blank line marks the end of a sentence
                    all_sentences.append(' '.join(words))
                    named_entities.append(' '.join(entities))
                    unique_entities |= set(entities)
                    words, entities = [], []
                else:
                    if word:
                        # lemmatize the token, using its POS tag when WordNet has one
                        word, typ = nltk.pos_tag(word_tokenize(word))[0]

                        typ = get_wordnet_pos(typ)
                        if typ:
                            lemmatized_word = word_lemmatizer.lemmatize(word, typ)
                        else:
                            lemmatized_word = word_lemmatizer.lemmatize(word)

                        words.append(lemmatized_word)
                        entities.append(named_entity)

        self.n_entities = len(unique_entities)
        self.entity_to_index = {t: i for i, t in enumerate(unique_entities)}
        self.df = pd.DataFrame(
            data={
                DataFrameHeaders.sentence: all_sentences,
                DataFrameHeaders.named_entity: named_entities
            })
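
load_and_clean_data() above assumes a CoNLL-2003-style input file: one token per space-separated line with the NER tag last, a blank line between sentences, and two header lines at the top that islice() skips. An illustrative fragment of that assumed format (not the project's actual data):

sample = (
    "-DOCSTART- -X- -X- O\n"
    "\n"
    "EU NNP B-NP B-ORG\n"
    "rejects VBZ B-VP O\n"
    "German JJ B-NP B-MISC\n"
    "\n"
)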
Code Example #4
print("Total articles: " + str(articles_count))
allwords_set = set()

print('Removing stopwords...')
data = removeStopWords(data)


# Lemmatisation of words
print('Lemmatization...')
lemmatizer = WordNetLemmatizer()
for article in data["articles"]:
    for text_id in article:
        # for every word in every article, replace the surface form with its lemma
        for word in article[str(text_id)]:
            # assumes utils.get_wordnet_pos() always returns a valid WordNet POS here
            word["word"] = lemmatizer.lemmatize(
                word["word"], utils.get_wordnet_pos(word["pos_tag"]))
            allwords_set.add(word["word"])

all_unique_words_count = len(allwords_set)
print("Total number of unique words: " + str(all_unique_words_count))


# calculate tf for every lemma and add it to set_tf
print('Calculating tf ')
# set_tf maps "article_id:word" keys (e.g. "12:start") to that word's tf in the
# given article, so get_tf_of_word_from_article() can return the tf of a word
# for a given article immediately, without any loop.
set_tf = {}
for article in data["articles"]:
    for text_id in article:
        # for every word in every article
        for word in article[str(text_id)]:
            key = str(text_id)+":"+word["word"]
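
The snippet cuts off before get_tf_of_word_from_article() is defined. Under the "article_id:word" key scheme described in the comment, a hypothetical version of that lookup could be as small as:

def get_tf_of_word_from_article(set_tf, article_id, word):
    """Hypothetical helper: O(1) tf lookup via the 'article_id:word' key scheme."""
    return set_tf.get(str(article_id) + ":" + word, 0)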
Code Example #5
                  for word in sent]

# this doesn't work, a little salty about it
#examples_words = [word for word_tokenize(sent) in idiom_examples \
#                  for word in word_tokenize(sent)]

# Does it matter that the last sentence of one document will be combined with
# the first sentence of another?
words = brown.words() + gutenberg.words() + reuters.words() + examples_words

print("{}: Lowercasing all the words...".format(datetime.now()))
words_lower = [w.lower() for w in words]

print("{}: Lemmatizing all the words...".format(datetime.now()))
wnlt = nltk.WordNetLemmatizer()
# assumes get_wordnet_pos() returns a valid WordNet POS for every Treebank tag
# (e.g. falls back to noun instead of returning '')
words_lemmatized = [wnlt.lemmatize(word, get_wordnet_pos(tb_pos))
                    for word, tb_pos in nltk.pos_tag(words_lower)]

#bigrams = nltk.collocations.BigramCollocationFinder.from_words(
#        words,
#        window_size=20)

#bigrams.apply_freq_filter(20)
#bigrams_freq = bigrams.ngram_fd

### TODO(?) : Try different window sizes
print("{}: Creating bigrams frequencies and storing results...".format(
    datetime.now()))
bigrams_freq = BigramCollocationFinder.from_words(words_lemmatized,
                                                  window_size=20).ngram_fd
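
bigrams_freq is an NLTK FreqDist over word pairs that co-occur within the 20-word window. For illustration, it can be inspected like this:

# most frequent co-occurring pairs
for pair, count in bigrams_freq.most_common(10):
    print(pair, count)

# count for one specific pair (0 if the pair never co-occurred in the window)
print(bigrams_freq[('kick', 'bucket')])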
Code Example #6
jsonObj = loads(dumps(badgerfish.data(fromstring(xmlstr))))

# lowercase the search query
for i in range(0, len(args.words)):
    args.words[i] = args.words[i].lower()

# add POS tags and remove stop words
wordsWithTagsAndStopWords = nltk.pos_tag(args.words)
wordsWithTags = utils.removeStopWordsFromListOfWords(wordsWithTagsAndStopWords)

# lemmatize the words and strip the tag, so they are ready for the query
lemmatizer = WordNetLemmatizer()
words = []
for i in range(0, len(wordsWithTags)):
    words.append(lemmatizer.lemmatize(
        wordsWithTags[i][0], utils.get_wordnet_pos(wordsWithTags[i][1])))

# get the articles containing the query words, weighting articles that
# contain several of the query words more heavily
docsContainingRequestedWords = getListOfDocsWithWeightsForWords(words)
finalListWithIdsAfterQuery = getSumOfWeightsForArticlesWithSameWords(
    docsContainingRequestedWords)

# display results
if len(finalListWithIdsAfterQuery) > 0:
    conn = sqlite3.connect(str(Path(__file__).parent) +
                           '/database/crawler_db.sqlite')

    cursor = conn.cursor()

    finalListWithIdsAfterQuery.sort(key=lambda x: x["@weight"], reverse=True)
    if args.limit is None:  # if no limit specified then display all relevant articles