Example #1
def create_corpus(project, repos, Kind, use_level=True, forced_ref=None):
    corpus_fname_base = project.full_path + Kind.__name__

    if use_level:
        corpus_fname_base += project.level

    if forced_ref:
        corpus_fname_base += forced_ref[:8]

    corpus_fname = corpus_fname_base + '.mallet.gz'
    dict_fname = corpus_fname_base + '.dict.gz'
    made_one = False

    if not os.path.exists(corpus_fname):
        combiner = CorpusCombiner()

        for repo in repos:
            try:
                if repo or forced_ref:
                    corpus = Kind(project=project,
                                  repo=repo,
                                  lazy_dict=True,
                                  ref=forced_ref,
                                  )
                else:
                    corpus = Kind(project=project, lazy_dict=True)

            except KeyError:
                continue
            except TaserError as e:
                if repo == repos[-1] and not made_one:
                    # basically, if we are at the last repo and we STILL
                    # haven't successfully extracted a corpus, ring some bells
                    raise e
                else:
                    # otherwise, keep trying. winners never quit.
                    continue

            combiner.add(corpus)
            made_one = True

        # write the corpus and dictionary to disk. this will take a while.
        combiner.metadata = True
        MalletCorpus.serialize(corpus_fname, combiner, id2word=combiner.id2word,
                               metadata=True)
        combiner.metadata = False

        # write out the dictionary
        combiner.id2word.save(dict_fname)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
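
# --- usage sketch (illustrative, not from the original snippet) ---
# Assumes `project` exposes `full_path` and `level`, `repos` is a list of
# repository handles, and Kind classes such as ChangesetCorpus come from the
# surrounding codebase.
def load_or_build(project, repos, Kind, ref=None):
    # create_corpus caches to <full_path><Kind><level>[<ref[:8]>].mallet.gz,
    # so repeated calls with the same arguments reuse the files on disk.
    return create_corpus(project, repos, Kind, use_level=True, forced_ref=ref)
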
def create_corpus(config, Kind):
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(corpus_fname):
        corpus = Kind(config.repo, config.project.commit, lazy_dict=True)
        corpus.metadata = True
        MalletCorpus.serialize(corpus_fname, corpus,
                               id2word=corpus.id2word, metadata=True)
        corpus.metadata = False
        corpus.id2word.save(corpus_fname + '.dict')
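
# --- usage sketch (illustrative, not from the original snippet) ---
# Assumes a `config` object exposing `corpus_fname` (a '%s' filename template),
# `repo`, and `project.commit`, as the snippet above expects.
def build_all_corpora(config, kinds):
    # Serialize one MALLET corpus (plus its dictionary) per corpus Kind; the
    # os.path.exists() guard in create_corpus skips work already on disk.
    for Kind in kinds:
        create_corpus(config, Kind)
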
Example #3
def create_queries(project):
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.ordered.gz'
    dict_fname = corpus_fname_base + '.dict.gz'

    if not os.path.exists(corpus_fname):
        pp = GeneralCorpus(lazy_dict=True)
        id2word = Dictionary()

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [x.strip() for x in f.readlines()]

        queries = list()
        for id in ids:
            with open(
                    os.path.join(project.full_path, 'queries',
                                 'ShortDescription' + id + '.txt')) as f:
                short = f.read()

            with open(
                    os.path.join(project.full_path, 'queries',
                                 'LongDescription' + id + '.txt')) as f:
                long = f.read()

            text = ' '.join([short, long])
            text = pp.preprocess(text)

            # this step will remove any words not found in the dictionary
            bow = id2word.doc2bow(text, allow_update=True)

            queries.append((bow, (id, 'query')))

        # write the corpus and dictionary to disk. this will take a while.
        MalletCorpus.serialize(corpus_fname,
                               queries,
                               id2word=id2word,
                               metadata=True)

        # write out the dictionary so the re-open below can find it
        id2word.save(dict_fname)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
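
# --- usage sketch (illustrative, not from the original snippet) ---
# Assumes `project.full_path` contains ids.txt plus queries/ShortDescription<id>.txt
# and queries/LongDescription<id>.txt files, as create_queries reads above.
def print_query_sizes(project):
    queries = create_queries(project)
    for bow in queries:
        # each item is a bag-of-words: a list of (term_id, count) pairs
        print(len(bow), 'distinct terms')
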
def create_queries(project):
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.mallet.gz'
    dict_fname = corpus_fname_base + '.dict.gz'

    if not os.path.exists(corpus_fname):
        pp = GeneralCorpus(lazy_dict=True)
        id2word = Dictionary()

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [x.strip() for x in f.readlines()]

        queries = list()
        for id in ids:
            with open(os.path.join(project.full_path, 'queries',
                                    'ShortDescription' + id + '.txt')) as f:
                short = f.read()

            with open(os.path.join(project.full_path, 'queries',
                                    'LongDescription' + id + '.txt')) as f:
                long = f.read()

            text = ' '.join([short, long])
            text = pp.preprocess(text)

            # this step will remove any words not found in the dictionary
            bow = id2word.doc2bow(text, allow_update=True)

            queries.append((bow, (id, 'query')))

        # write the corpus and dictionary to disk. this will take a while.
        MalletCorpus.serialize(corpus_fname, queries, id2word=id2word,
                               metadata=True)

        # write out the dictionary so the re-open below can find it
        id2word.save(dict_fname)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
    def test_mallet_corpus(self):
        with open(self.tempfname, 'w') as f:
            f.write('abc en fred flintstone\n')
            f.write('efg en barney rubble\n')
        corpus3 = MalletCorpus(self.tempfname)
        self.assertEqual(len(corpus3), 2)
        self.corpus.add(corpus3)
        self.assertEqual(len(self.corpus), 10)

        documents = [
            # corpus1
            ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
            ([u'graph', u'minors', u'a',
              u'survey'], ('unix.txt', u'test_git')),

            # corpus2
            ([u'graph', u'minors', u'a', u'survey'], ('dos.txt', u'test_git')),
            ([u'graph', u'minors', u'a', u'survey'], ('mac.txt', u'test_git')),
            ([u'graph', u'minors', u'a',
              u'survey'], ('unix.txt', u'test_git')),
            ([
                u'human', u'machine', u'interface', u'for', u'lab', u'abc',
                u'computer', u'applications'
            ], ('a/0.txt', u'test_git')),
            ([
                u'a', u'survey', u'of', u'user', u'opinion', u'of',
                u'computer', u'system', u'response', u'time'
            ], ('a/1.txt', u'test_git')),

            # mallet
            ([u'fred', u'flintstone'], ('abc', u'en')),
            ([u'barney', u'rubble'], ('efg', u'en')),
        ]

        documents = [(set(x), y) for x, y in documents]

        self.corpus.metadata = True
        vals = [
            self.corpus.metadata, self.corpus._metadata,
            self.corpus.corpora[0].metadata, self.corpus.corpora[1].metadata,
            self.corpus.corpora[2].metadata
        ]
        self.assertTrue(all(vals))

        for docmeta in self.corpus:
            doc, meta = docmeta
            self.assertGreater(len(doc), 0)

            # convert the document ids back to text, since we don't know the
            # term ids ahead of time for testing.
            textdoc = set(unicode(self.corpus.id2word[x[0]]) for x in doc)
            docmeta = textdoc, meta
            self.assertIn(docmeta, documents)
Example #6
def create_corpus(project, repos, Kind, use_level=True, forced_ref=None):
    names = [Kind.__name__]
    args = {
        'project': project,
        'lazy_dict': True,
    }

    if use_level:
        names.append(project.level)

    if Kind is ChangesetCorpus:
        names.append(project.changeset_config_string)
        args.update(project.changeset_config)

    if forced_ref:
        names.append(forced_ref[:8])

    corpus_fname_base = project.full_path + '-'.join(names)

    corpus_fname = corpus_fname_base + '.mallet.gz'
    dict_fname = corpus_fname_base + '.dict.gz'
    made_one = False

    if not os.path.exists(corpus_fname):
        combiner = CorpusCombiner()

        for repo in repos:
            try:
                if repo or forced_ref:
                    args.update({
                        'repo': repo,
                        'ref': forced_ref,
                    })
                corpus = Kind(**args)

            except KeyError:
                continue
            except TaserError as e:
                if repo == repos[-1] and not made_one:
                    # basically, if we are at the last repo and we STILL
                    # haven't successfully extracted a corpus, ring some bells
                    raise e
                else:
                    # otherwise, keep trying. winners never quit.
                    continue

            combiner.add(corpus)
            made_one = True

        # write the corpus and dictionary to disk. this will take a while.
        combiner.metadata = True
        MalletCorpus.serialize(corpus_fname, combiner, id2word=combiner.id2word,
                               metadata=True)
        combiner.metadata = False

        # write out the dictionary
        combiner.id2word.save(dict_fname)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
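
# --- usage sketch (illustrative, not from the original snippet) ---
# Assumes `project` provides `changeset_config` and `changeset_config_string`,
# which the ChangesetCorpus branch above reads; `refs` is a list of commit SHAs.
def build_corpora_for_refs(project, repos, refs):
    corpora_by_ref = {}
    for ref in refs:
        # the first 8 characters of each ref become part of the cached filename,
        # so every ref gets its own corpus/dictionary pair on disk
        corpora_by_ref[ref] = create_corpus(project, repos, ChangesetCorpus,
                                            forced_ref=ref)
    return corpora_by_ref
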
# ### Author-Doc List

# In[24]:

# imports used throughout this script
import re
import string
import random

import spacy
import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# `json_file` is assumed to be the already-parsed input JSON: a list of
# message dicts with 'Sender', 'Receiver', 'Title', and 'Content' keys.
content = json_file

# Get all author names and their corresponding document IDs.
author2doc = dict()

for i, entry in enumerate(content):
    sender = entry['Sender'].replace('\n', ' ')
    if sender not in author2doc:
        # This is a new author.
        author2doc[sender] = [i]
    else:
        # Add the document ID to an existing author.
        author2doc[sender].append(i)

for i, entry in enumerate(content):
    receiver = entry['Receiver'].replace('\n', ' ')
    if receiver not in author2doc:
        # This is a new author.
        author2doc[receiver] = [i]
    else:
        # Add the document ID to an existing author.
        author2doc[receiver].append(i)


# ### Clean text data

# In[25]:


nlp = spacy.load('en')  # spaCy English model ('en_core_web_sm' in newer spaCy releases)

### using both title and abstract
abstract = []
for entry in content:
    title = entry['Title'].replace('\n', ' ')
    title = title.replace('/u', ' ')
    abst = entry['Content'].replace('\n', ' ')
    abst = abst.replace('/u', ' ')
    abst = abst.replace('%', ' ')
    entry_str = title + ' ' + abst
    # drop very short words (1-3 characters)
    entry_str = re.sub(r'\b\w{1,3}\b', '', entry_str)
    abstract.append(entry_str)


# In[26]:


### Load stopwords

stopword = stopwords.words('english')


# In[27]:


### lemmatization, bigrams
processed_docs = []
for doc in nlp.pipe(abstract, n_threads=4, batch_size=100):
    # Process document using Spacy NLP pipeline.

    ents = doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list.
    doc = [token for token in doc if token not in stopword]

    # Add named entities, but only if they are a compound of more than one word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])

    processed_docs.append(doc)

abstract_all = processed_docs
del processed_docs

from gensim.models import Phrases
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(abstract_all, min_count=20)
for idx in range(len(abstract_all)):
    for token in bigram[abstract_all[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            abstract_all[idx].append(token)



dictionary = Dictionary(abstract_all)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.2
min_wordcount = 80
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.



Total = []
for c in content:
    ## using the title of each entry
    total = c['Title']
    Total.append(total)
    
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(entry).split() for entry in Total]



def bestModel(abstract_all):
    co_score_tfidf = []
    co_score_lda = []
    co_score_mallet = []

    for i in range(0,10):
        random.shuffle(abstract_all)

        training = abstract_all[:round(len(abstract_all)*0.6)]
        test = abstract_all[round(len(abstract_all)*0.6):]

        doc_clean_train = [entry for entry in training]
        doc_clean_test = [entry for entry in test]
        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        dictionary_tr = corpora.Dictionary(doc_clean_train)
        dictionary_te = corpora.Dictionary(doc_clean_test)
        dictionary = corpora.Dictionary(abstract_all)
        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        doc_term_matrix_te = [dictionary_te.doc2bow(doc) for doc in doc_clean_test]
        doc_term_matrix_tr = [dictionary_tr.doc2bow(doc) for doc in doc_clean_train]
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in abstract_all]


        tfidf = models.TfidfModel(doc_term_matrix)
        corpus_tfidf = tfidf[doc_term_matrix_tr]
        corpus_tfidf_te = tfidf[doc_term_matrix_te]

        lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary_tr, passes=2, workers=4)

        Lda = gensim.models.ldamodel.LdaModel
        ldamodel = Lda(doc_term_matrix_tr, num_topics=5, id2word = dictionary_tr, passes=50)
        # mallet_path must be defined elsewhere and point to the local MALLET binary
        ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=doc_term_matrix_tr, num_topics=20, id2word=dictionary_tr)
    
        #tfidf
        coherence_model_ldatfidf = CoherenceModel(model=lda_model_tfidf, texts=doc_clean_test, dictionary=dictionary_te, coherence='c_v')
        coherence_ldatfidf = coherence_model_ldatfidf.get_coherence()
    
        co_score_tfidf.append(coherence_ldatfidf)
    
        #lda
        coherence_model = CoherenceModel(model=ldamodel, texts=doc_clean_test, dictionary=dictionary_te, coherence='c_v')
        coherence_lda = coherence_model.get_coherence()
    
        co_score_lda.append(coherence_lda)
    
        #mallet
        coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=doc_clean_test, dictionary=dictionary_te, coherence='c_v')
        coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    
        co_score_mallet.append(coherence_ldamallet)
    
      
    
    avg_co_lda = sum(co_score_lda)/10
    avg_co_tfidf = sum(co_score_tfidf)/10
    avg_co_mallet = sum(co_score_mallet)/10

    result = {avg_co_lda:'lda',avg_co_tfidf:'tfidf',avg_co_mallet:'mallet'}
    maximum = max([avg_co_lda,avg_co_tfidf,avg_co_mallet])
    best = result[maximum]
    
    return best
    
    
    


# In[31]:


def compute_coherence_values(total, best, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    random.shuffle(total)

    training = total[:round(len(total)*0.6)]
    test = total[round(len(total)*0.6):]

    doc_clean_train = [clean(entry).split() for entry in training]
    doc_clean_test = [clean(entry).split() for entry in test]
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary_tr = corpora.Dictionary(doc_clean_train)
    dictionary_te = corpora.Dictionary(doc_clean_test)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix_te = [dictionary_te.doc2bow(doc) for doc in doc_clean_test]
    doc_term_matrix_tr = [dictionary_tr.doc2bow(doc) for doc in doc_clean_train]
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
     
    coherence_values = []
    if best == 'lda':
        for n in range(start, limit, step):
            Lda = gensim.models.ldamodel.LdaModel
            ldamodel = Lda(doc_term_matrix_tr, num_topics=n, id2word = dictionary_tr, passes=50)
            coherence_model = CoherenceModel(model=ldamodel, texts=doc_clean_test, dictionary=dictionary_te, coherence='c_v')
            coherence_lda = coherence_model.get_coherence()
    
            coherence_values.append(coherence_lda)
         
    if best == 'tfidf':
        for n in range(start, limit, step):
            tfidf = models.TfidfModel(doc_term_matrix)
            corpus_tfidf = tfidf[doc_term_matrix_tr]
            corpus_tfidf_te = tfidf[doc_term_matrix_te]

            lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=n, id2word=dictionary_tr, passes=2, workers=4)
            coherence_model_ldatfidf = CoherenceModel(model=lda_model_tfidf, texts=doc_clean_test, dictionary=dictionary_te, coherence='c_v')
            coherence_ldatfidf = coherence_model_ldatfidf.get_coherence()
    
            coherence_values.append(coherence_ldatfidf)
   
    if best == 'mallet':
        for n in range(start, limit, step):
            ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=doc_term_matrix_tr, num_topics=n, id2word=dictionary_tr)
            coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=doc_clean_test, dictionary=dictionary_te, coherence='c_v')
            coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    
            coherence_values.append(coherence_ldamallet)
            

    return coherence_values





# In[32]:


def getOptimal(start, limit, step, coherence):
    x = range(start, limit, step)
    xlist = []
    for i, cv in zip(x,coherence):
        #print("Num Topics =", i, " has Coherence Value of", round(cv, 4))
        xlist.append(i)
    
    optimal = []
    last_x = start
    last_y = coherence[0]
    #last_slope = 1
    for i,cv in enumerate(coherence):
        #print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
        last_slope = (cv-last_y)/step
        if i < len(coherence) - 1:
            next_y = coherence[i + 1]
            next_slope = (next_y-cv)/step
            if next_slope <= last_slope and next_slope >= 0  and i > 0:
                optimal.append((xlist[i]))
        else:
            break
        last_y = cv
        #last_x = i
    return min(optimal)

best = bestModel(abstract_all)
coherence = compute_coherence_values(Total, best, limit=40, start=2, step=6)
optimal_topics = getOptimal(2,40,6,coherence)





dictionary = corpora.Dictionary(abstract_all)


### AT Corpus
atcorpus = [dictionary.doc2bow(doc) for doc in abstract_all]
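
# Illustrative sketch (not in the original notebook): the author2doc mapping
# built above is the input gensim's AuthorTopicModel expects alongside a
# bag-of-words corpus and a Dictionary.
def train_author_topic_model(corpus, num_topics):
    from gensim.models import AuthorTopicModel
    # author2doc maps each sender/receiver name to the indices of the
    # documents (messages) they appear in.
    return AuthorTopicModel(corpus=corpus, num_topics=num_topics,
                            id2word=dictionary, author2doc=author2doc)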

### LDA Mallet Corpus
from gensim.test.utils import get_tmpfile
from gensim.corpora import MalletCorpus
from gensim.corpora import Dictionary

# Write corpus in Mallet format to disk
output_fname = get_tmpfile("corpus.mallet")
MalletCorpus.serialize(output_fname, atcorpus, dictionary)

mallet_corpus = MalletCorpus(output_fname)

malcorpus = list(mallet_corpus)

### LDA-tfidf Corpus
from operator import itemgetter
import gensim
from gensim import corpora,models
tfidf = models.TfidfModel(atcorpus)
corpus_tfidf = tfidf[atcorpus]

l = list(corpus_tfidf)

index = 0
tfidfcorpus = []
for i in l:
    index += 1
    # Convert tf-idf weights to integer pseudo-counts by dividing each weight
    # by the smallest weight in the document.
    common_denom = min(i, key=itemgetter(1))[1] if i else None
    if common_denom is not None:
        new_list = []
        for f in i:
            n = f[1] / common_denom
            new_list.append((f[0], int(n)))
        tfidfcorpus.append(new_list)
    else:
        # empty document: nothing to rescale
        tfidfcorpus.append([])




def showTopics(model, num):
    # Print the top words of each topic and return a short three-word label per topic.
    topics = []
    i = 1
    for topic in model.show_topics(num_topics=num):
        words = []
        for word, prob in model.show_topic(topic[0]):
            words.append(word)
        print('Topic '+str(i)+': ')
        print(words[2]+' '+words[1]+' '+words[0])
        print(*words)
        print()
        i += 1
        topics.append(words[2]+' '+words[1]+' '+words[0])
    return topics
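

# --- usage sketch (illustrative, not from the original notebook) ---
# Train a final model on the full corpus with the selected topic count and
# print its topics with showTopics.
final_lda = gensim.models.ldamodel.LdaModel(atcorpus, num_topics=optimal_topics,
                                            id2word=dictionary, passes=50)
topic_summaries = showTopics(final_lda, optimal_topics)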