def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    # remove stopwords (skip any that never made it into the dictionary)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = [dictionary.token2id[word] for word in stopwords
                    if word in dictionary.token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # remove short words, len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems()
                     if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear in only one document
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
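
A minimal usage sketch for prep_corpus, assuming docs is already a list of token lists and that the nltk_stopwords helper used above is available; the sample documents below are hypothetical:

sample_docs = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
]
dictionary, corpus = prep_corpus(sample_docs, additional_stopwords={'survey'},
                                 no_below=1, no_above=1.0)
print(dictionary.token2id)   # surviving tokens and their ids
print(corpus[0])             # first document as (token_id, count) pairs
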
    def produce(self):        

        print('Getting src docs')
        docs = []
        doctokens = [] # aka Gensim's "text"
        stopwords = nltk.corpus.stopwords.words('english')
        for doc in self.src_doc_generator():
            (doc_id,doc_label,doc_str) = doc
            docs.append(doc)
            doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
            if len(docs) % 1000 == 0: print(len(docs))
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        #dictionary.compactify()
        #dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating WORD') # aka Gensim's "dictionary"
            db.create_table('word')
            for word_id, word_str in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',(word_id,word_str))
            
            print('Creating DOC and DOCWORD')
            db.create_table('doc')
            db.create_table('docword')
            for doc_idx, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc (doc_index,doc_id,doc_label,doc_str ) VALUES (?,?,?,?)',(doc_idx,doc[0],doc[1],doc[2]))
                doc_id = doc[0]
                for word_id, word_count in (dictionary.doc2bow(doctokens[doc_idx])):
                    word_str = dictionary.get(word_id) # map the token id back to its string
                    db.cur.execute('INSERT INTO docword (doc_index,doc_id,word_id,word_str,word_count) VALUES (?,?,?,?,?)',(doc_idx,doc_id,word_id,word_str,word_count))
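
The WORD, DOC and DOCWORD tables written above form a simple relational bag-of-words store. A minimal readback sketch with the standard sqlite3 module, assuming the underlying store is SQLite (the database file name is hypothetical):

import sqlite3

conn = sqlite3.connect('corpus.db')  # hypothetical database file
cur = conn.cursor()
# reconstruct the bag of words for the first document
cur.execute('SELECT word_str, word_count FROM docword WHERE doc_index = ?', (0,))
for word_str, word_count in cur.fetchall():
    print(word_str, word_count)
conn.close()
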
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import (preprocess_string, strip_punctuation,
                                          strip_multiple_whitespaces, strip_numeric)


def prepare_word_embedding():
    """Construct the vocabulary file and the word embedding file."""
    df = pd.read_csv(
        "data/raw/train.csv", usecols=["original_phrase1", "original_phrase2", "ytrue"]
    )

    model = KeyedVectors.load_word2vec_format(
        "/data/mayu-ot/Data/Model/GoogleNews-vectors-negative300.bin.gz", binary=True
    )

    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
    ]

    doc = [preprocess_string(x, CUSTOM_FILTERS) for x in df.values[:, :2].ravel()]

    dct = Dictionary(doc)

    bad_ids = []
    for k, v in dct.iteritems():
        if v not in model:
            bad_ids.append(k)
    dct.filter_tokens(bad_ids)

    dct.compactify()

    for k, v in dct.iteritems():
        print(k, v)
        if k == 10:
            break

    dct.save_as_text("data/processed/dictionary.txt")

    word_emb = np.ones((len(dct), 300))

    for k, v in dct.iteritems():
        word_emb[k] = model[v]

    np.save("data/processed/word2vec", word_emb)
    def test_dict_interface(self):
        """Test Python 2 dict-like interface in both Python 2 and 3."""
        d = Dictionary(self.texts)

        self.assertTrue(isinstance(d, Mapping))

        self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

        # Even in Py3, we want the iter* members.
        self.assertEqual(list(d.items()), list(d.iteritems()))
        self.assertEqual(list(d.keys()), list(d.iterkeys()))
        self.assertEqual(list(d.values()), list(d.itervalues()))

        # XXX Do we want list results from the dict members in Py3 too?
        if not PY3:
            self.assertTrue(isinstance(d.items(), list))
            self.assertTrue(isinstance(d.keys(), list))
            self.assertTrue(isinstance(d.values(), list))
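
A quick interactive sketch of the same dict-like interface on a toy Dictionary (the token list is hypothetical):

from gensim.corpora import Dictionary

d = Dictionary([['human', 'interface', 'computer']])
print(dict(d.items()))                       # maps token ids to token strings
print(list(d.iterkeys()) == list(d.keys()))  # True; the iter* members mirror the dict API
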
Example #7
    def produce(self):
        doc_n = 0
        docs = []
        doctokens = [] # AKA gensim "text"
        stopwords = nltk.corpus.stopwords.words('english')

        NOALPHA = re.compile('[^a-z]+')
        def prep_string(my_string,pattern = NOALPHA):
            return re.sub(pattern, ' ', my_string.strip().lower())

        print('Getting src docs')
        for doc in self.src_doc_generator():
            content = prep_string(doc) # lowercase first, then strip non-letters; could move this into the corpus generator
            docs.append(content)
            doctokens.append([token for token in nltk.word_tokenize(content) if token not in stopwords])
            doc_n += 1
            if doc_n % 1000 == 0: print(doc_n)
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        dictionary.compactify()
        dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating DOC')
            db.create_table('doc')
            for i, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc VALUES (?,?)',(i,doc))

            print('Creating WORD')
            db.create_table('word')
            for item in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',item)

            print('Creating DOCWORD')
            db.create_table('docword')
            for i, tokens in enumerate(doctokens):
                for item in (dictionary.doc2bow(tokens)):
                    db.cur.execute('INSERT INTO docword (doc_id,word_id,word_count) VALUES (?,?,?)',[i,item[0],item[1]])
Example #8
        dict2 = Dictionary.load(os.path.join(dirpath, "dic_s.txt"))
    elif storage_env == "remote":
        # todo: construct Dictionary objects from remote file contents
        # texts = [
        # 	['human', 'interface', 'computer']
        # ]
        # my_dict = Dictionary(texts)
        raise NotImplementedError

    return dict1, dict2


if __name__ == "__main__":

    texts = [['human', 'interface', 'computer']]
    my_dict = Dictionary(texts)
    for s in my_dict.iteritems():
        print(s)

    d1, d2 = load_dictionaries()

    print("-----------------------------")
    print("DICTIONARY 1", type(d1), len(d1))
    for s in d1.iteritems():
        print(s)

    print("-----------------------------")
    print("DICTIONARY 2", type(d1), len(d1))
    for s in d2.iteritems():
        print(s)
Example #9
def extract_and_save_biterm(fname,
                            embed_size=300,
                            min_count=5,
                            max_percent=0.5,
                            iteration=200):
    '''
    Simple preprocessing for biterm extraction.

    A biterm is an unordered word pair.
    Biterms are drawn from individual documents, not from the whole corpus
    (see the small worked example after this function).
    '''

    docs = read_corpus(fname, labeled=False, tokens_only=True)
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character, and remove stop words
    docs = [[
        token for token in doc if len(token) > 1 and token not in STOP_WORDS
    ] for doc in docs]

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=min_count, no_above=max_percent)
    dictionary.compactify()
    # encode: keep only tokens that survived the dictionary filtering
    docs = [[token for token in doc if token in dictionary.token2id]
            for doc in docs]

    # Remove docs that contain fewer than 2 unique words
    docs = [doc for doc in docs if len(set(doc)) > 1]
    model = gensim.models.Word2Vec(docs,
                                   workers=4,
                                   size=embed_size,
                                   iter=100,
                                   min_count=2)

    docs = [dictionary.doc2idx(doc) for doc in docs]

    biterms = {}
    i = 0
    doc_bitems = []
    for doc in docs:
        d_bi = {}
        doc = sorted(doc)
        for x in range(len(doc) - 1):
            for y in range(x + 1, len(doc)):
                if doc[x] == doc[y]:
                    continue
                biterm = (doc[x], doc[y])
                idx = 0
                if biterm not in biterms:
                    biterms[biterm] = i
                    idx = i
                    i += 1
                else:
                    idx = biterms[biterm]
                if idx in d_bi:
                    d_bi[idx] += 1
                else:
                    d_bi[idx] = 1
        doc_bitems.append(d_bi)
    fname = os.path.basename(fname)
    fname = fname.split('.')[0]
    dirc = os.path.join(os.getcwd(), 'Data', 'unsupervised')
    if not os.path.exists(dirc):
        os.makedirs(dirc)

    embeddings = {}
    for key, token in dictionary.iteritems():
        embeddings[key] = model.wv[token]

    dictionary.save(os.path.join(dirc, fname + '_dic.pkl'))
    biterms = dict([key, val] for val, key in biterms.items())  # invert to index -> biterm

    with open(os.path.join(dirc, fname + '_bit.pkl'), 'wb') as f:
        pickle.dump(biterms, f)
    with open(os.path.join(dirc, fname + '_doc_bit.pkl'), 'wb') as f:
        pickle.dump(doc_bitems, f)
    with open(os.path.join(dirc, fname + '_emb.pkl'), 'wb') as f:
        pickle.dump(embeddings, f)
    with open(os.path.join(dirc, fname + '_doc.pkl'), 'wb') as f:
        pickle.dump(docs, f)
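
The docstring above defines a biterm as an unordered word pair drawn from a single document. A tiny worked illustration of the pairs the nested loop produces, using a hypothetical id-encoded document:

doc = sorted([3, 1, 2, 1])        # e.g. the output of dictionary.doc2idx(...)
pairs = []
for x in range(len(doc) - 1):
    for y in range(x + 1, len(doc)):
        if doc[x] != doc[y]:      # identical ids are skipped, as in the function above
            pairs.append((doc[x], doc[y]))
print(pairs)                      # [(1, 2), (1, 3), (1, 2), (1, 3), (2, 3)]
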
from gensim.corpora import Dictionary


#
# doc2bow: convert a document into a bag of words (bow), i.e. (token_id, token_count) pairs
# 1. documents
texts = [['humans', 'interface', 'computer'], ["hello", "wife", "computer"]]

# 2. create a Dictionary
dct = Dictionary(texts)

# 3. add new documents
dct.add_documents([["cat", "say", "meow", "dog"], ["dog"]])

# 4. inspect the Dictionary items: iteritems() yields (token_id, token) pairs
for key, value in dct.iteritems():
    print(key, value)

# 5. doc to bag of words
bow = dct.doc2bow(["dog", "computer", "non_existent_word", "computer"])
print(bow)

# 6. filter out extremes
dct.filter_extremes(no_below=1, no_above=0.5, keep_n=3)
for key, value in dct.iteritems():
    print(key, value)

# 7. build a TF-IDF model; TfidfModel expects a corpus (a list of bow vectors),
#    not a single bow vector
from gensim import models

corpus = [dct.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
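
A short follow-up that maps the TF-IDF weights of one document back to token strings through the dictionary:

for token_id, weight in tfidf[corpus[0]]:
    print(dct[token_id], round(weight, 3))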