Example #1
def termdocumentmatrix(train_path, test_path, cnn=True):

    tdm = textmining.TermDocumentMatrix()

    tdm, train_labels = add_doc(tdm, train_path)
    tdm, test_labels = add_doc(tdm, test_path)

    tdm_rows = [x for x in tdm.rows()]

    words = tdm_rows[0]
    word_index_dict = {}

    for i in range(0, len(words)):
        word_index_dict[words[i]] = i

    train_len, tdm_rows = add_senti_score(train_path, tdm_rows,
                                          word_index_dict)
    test_len, tdm_rows = add_senti_score(test_path, tdm_rows, word_index_dict)

    train_tdm = np.asarray(tdm_rows[1:train_len + 1])
    test_tdm = np.asarray(tdm_rows[train_len + 1:train_len + test_len + 1])
    train_labels = np.asarray(train_labels)
    if cnn:
        train_tdm = reshapeX(train_tdm)
        test_tdm = reshapeX(test_tdm)
        train_labels = reshapeY(train_labels)

    test_labels = np.asarray(test_labels)

    return train_labels, train_tdm, test_labels, test_tdm, words
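A minimal usage sketch for the pipeline above, with hypothetical CSV paths; the helpers add_doc, add_senti_score, reshapeX and reshapeY are assumed to be defined in the same module:

# Hypothetical file names; the real paths depend on the surrounding project.
train_labels, train_tdm, test_labels, test_tdm, words = termdocumentmatrix(
    'train_reviews.csv', 'test_reviews.csv', cnn=False)
print(train_tdm.shape)   # (number of training docs, number of feature columns)
print(len(words))        # size of the shared vocabulary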
Example #2
def term_document_matrix():
    # Read every line of the input once; rebuilding the matrix inside a loop
    # over the same file handle would exhaust it after the first pass.
    reading_file_info = [item.rstrip('\n') for item in fileinput]
    num_lines = len(reading_file_info)
    tdm = textmining.TermDocumentMatrix()
    for i in range(0, num_lines):
        tdm.add_doc(reading_file_info[i])
    tdm.write_csv('TermDocumentMatrix.csv', cutoff=1)
    temp = list(tdm.rows(cutoff=1))
    vocab = tuple(temp[0])
    x = np.array(temp[1:])
    mu = random((num_lines, 3))
    fcm = p.FuzzyCMeans(x, mu, 2)
    print fcm.mu

    model = lda.LDA(n_topics=15, n_iter=50, random_state=1)
    model.fit(x)

    topic_word = model.topic_word_
    n_top_words = 10

    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        fileoutput.write('Topic {}: {}\n'.format(i, ' '.join(topic_words)))
    fileoutput.close()
def termdocumentmatrix_example():
    path = "/Users/franciscojavierarceo/MyPrograms/Python/"
    os.chdir(path)
    # Create some very short sample documents
    doc1 = 'John and Bob are brothers.'
    doc2 = 'John went to the store. The store was closed.'
    doc3 = 'Bob went to the store too.'
    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix()
    # Add the documents
    tdm.add_doc(doc1)
    tdm.add_doc(doc2)
    tdm.add_doc(doc3)
    # Write out the matrix to a csv file. Note that setting cutoff=1 means
    # that words which appear in 1 or more documents will be included in
    # the output (i.e. every word will appear in the output). The default
    # for cutoff is 2, since we usually aren't interested in words which
    # appear in a single document. For this example we want to see all
    # words however, hence cutoff=1.
    tdm.write_csv('matrix.csv', cutoff=1)
    print tdm
    # Instead of writing out the matrix you can also access its rows directly.
    # Let's print them to the screen.
    for row in tdm.rows(cutoff=1):
        print row
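For reference, tdm.rows() always yields the vocabulary as its first row and one count vector per added document after that; a minimal sketch pairing terms with counts for the first sample document:

import textmining

tdm = textmining.TermDocumentMatrix()
tdm.add_doc('John and Bob are brothers.')
tdm.add_doc('John went to the store. The store was closed.')

rows = list(tdm.rows(cutoff=1))
vocab = rows[0]          # header row: one entry per term
doc1_counts = rows[1]    # counts for the first document, aligned with vocab
for term, count in zip(vocab, doc1_counts):
    if count:
        print('%s: %d' % (term, count))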
Example #4
def termdoc(dicto):
    docs = []
    cuisines = ['cuisine']

    # Add data to the TDM and remove unicode from cuisine names
    for entry in dicto:
        docs.append(dicto[entry])
        cuisines.append(entry.encode('ascii', 'ignore'))

    # Use of textmining library to obtain TDM
    tdm = textmining.TermDocumentMatrix()
    for doc in docs:
        tdm.add_doc(doc)

    matrix_file = 'matrix.csv'

    # Remove 'matrix.csv' if it already exists
    try:
        os.remove(matrix_file)
    except OSError:
        pass

    # Write frequencies of all ingredients in each cuisine
    for row, cuisine in zip(tdm.rows(cutoff=1), cuisines):
        with open(matrix_file, 'ab') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([cuisine] + row)
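A hypothetical call to termdoc; the input dict maps each cuisine name to a space-separated string of its ingredients, which the function adds as one document per cuisine:

# Hypothetical cuisine -> ingredients mapping.
recipes = {
    u'italian': u'tomato basil olive oil garlic',
    u'mexican': u'tomato corn beans chili',
}
termdoc(recipes)   # appends one row per cuisine to matrix.csv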
Example #5
    def __GetDataSet__(self):
        docTermMatrix = []
        vocab = []
        docIds = []
        tdm = textmining.TermDocumentMatrix()

        for i in range(len(self._docId_comma_documentsText_tuple_list)):
            print i, " extraction.. "
            docId, documentText = self._docId_comma_documentsText_tuple_list[i]
            tdm.add_doc(documentText.lower())
            # docIds.append(docId)

        i = -1
        for row in tdm.rows(cutoff=3):
            row = row[0:18000]
            i += 1
            # First row of 'document-term matrix' is vocabulary
            if i == 0:
                for vocabWord in row:
                    vocab.append(vocabWord)
                continue
            print i, "Loading from tdm matrix.."
            # Remaining rows are the per-document term-count vectors
            docTermMatrix.append(row)

        print "Converting doc-term matrix to numpy array..."
        return np.array(docTermMatrix), vocab, docIds
def cosine_similarity(list_of_file_paths):
    # Create some very short sample documents
    doc_list = [
        filehandler.convert_to_txt(file_path)
        for file_path in list_of_file_paths
    ]
    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix(
        tokenizer=simple_tokenize_remove_our_stopwords)
    for doc in doc_list:
        tdm.add_doc(doc)
    results = []
    is_first_row1 = True
    for row1 in tdm.rows(cutoff=1):
        if is_first_row1:
            is_first_row1 = False
            continue
        is_first_row2 = True
        cols = []
        for row2 in tdm.rows(cutoff=1):
            if is_first_row2:
                is_first_row2 = False
                continue
            cols.append(1 - spatial.distance.cosine(row1, row2))
        results.append(cols)
    return results
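The nested loop above re-materializes tdm.rows() for every outer row. An equivalent vectorized sketch, assuming the count rows are first collected into a NumPy array, uses scipy's pdist/squareform instead:

import numpy as np
from scipy import spatial

def cosine_similarity_fast(tdm):
    rows = list(tdm.rows(cutoff=1))
    X = np.array(rows[1:], dtype=float)   # skip the vocabulary header row
    # Condensed pairwise cosine distances, expanded to a square matrix.
    dist = spatial.distance.squareform(spatial.distance.pdist(X, metric='cosine'))
    return 1.0 - dist                     # similarity matrix; diagonal is 1.0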
def termdocumentmatrix_example():
    tdm = textmining.TermDocumentMatrix()
    for i in range(0, len(vec_dictionary)):
        tdm.add_doc(vec_dictionary[i][1])

    for row in tdm.rows(cutoff=50):
        D.append(row)
Example #8
def termdocumentmatrix_example():
    # Create some very short sample documents
    tdm = textmining.TermDocumentMatrix()
    mypath = "./corpus"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    documents = open('documents.csv', 'w')
    writer = csv.writer(documents)
    writer.writerow(('document_name', 'content'))
    pattern = r"(cid)+"
    for f in onlyfiles:
        if f[-4:] == ".txt":
            doc = open(mypath + "/" + f, 'r')
            txt = doc.read().replace(',', '')
            txt = ' '.join(txt.split())
            # re.sub returns the cleaned string, so it must be assigned back
            txt = re.sub(pattern, "", txt)

            writer.writerow((f, txt))
            tdm.add_doc(txt)
            doc.close()
    documents.close()

    # Write out the matrix to a csv file. Here cutoff=2 (the default), so only
    # words which appear in two or more documents are included in the output.
    tdm.write_csv('matrix.csv', cutoff=2)
def term_document_matrix():
    # Read every line of the text document once; building the matrix inside a
    # loop over the same file handle would exhaust it after the first pass.
    reading_file_info = [item.rstrip('\n') for item in fileinput]
    num_lines = len(reading_file_info)  # number of lines in the text document
    tdm = textmining.TermDocumentMatrix()  # container for the document-term matrix
    for i in range(0, num_lines):
        # Add data to the matrix line by line; tokenization is done internally
        tdm.add_doc(reading_file_info[i])
    # Write the document-term matrix to a CSV file
    tdm.write_csv('TermDocumentMatrix.csv', cutoff=1)
    temp = list(tdm.rows(cutoff=1))  # all rows of the document-term matrix
    vocab = tuple(temp[0])  # the first row holds each word of the documents
    x = np.array(temp[1:])  # skip the first row, which is only the vocabulary
    # cluster creation: random initial membership matrix with one row per
    # document and one column per cluster (6 clusters here; adjust as needed)
    mu = random((num_lines, 6))
    fcm = p.FuzzyCMeans(x, mu, 2)  # create the clusters
    num_arra = fcm.mu
    summation = num_arra.sum(axis=1)  # sum of each row of the membership matrix
    summation_vertical = summation[:, None]  # reshape the row sums into a column vector
    rows = num_arra.shape[0]  # number of rows of the membership matrix
    columns = num_arra.shape[1]  # number of columns of the membership matrix
    num_arra = num_arra.astype(float)  # cast to float to hold the normalized values
    for rows_count in range(0, rows):  # iterate over the rows
        divide_sum = summation_vertical.item(rows_count, 0)  # row sum used for normalization
        for i in range(0, columns):  # iterate over the columns
            # normalize the existing value and write it back in place
            replace_division = num_arra.item(rows_count, i) / divide_sum
            num_arra[rows_count, i] = replace_division
    print num_arra  # membership matrix whose rows now each sum to 1
    print num_arra.sum(axis=1)
    # LDA implementation
    model = lda.LDA(n_topics=2, n_iter=10, random_state=2)
    model.fit(x)

    topic_word = model.topic_word_
    n_top_words = 11

    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        fileoutput.write('Topic {}: {}\n'.format(i, ' '.join(topic_words)))
    fileoutput.close()
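The per-element normalization loop above can be written as a single NumPy broadcast; a minimal equivalent sketch:

def normalize_rows(num_arra):
    # Divide every row of the membership matrix by its row sum in one step.
    num_arra = num_arra.astype(float)
    return num_arra / num_arra.sum(axis=1, keepdims=True)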
Example #10
def summarize(text):
    # prepare text
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    tdm = textmining.TermDocumentMatrix()
    tdm.add_doc(newtext)
    for index, row in enumerate(tdm.rows(cutoff=1)):
        if index == 0: words = row
        if index == 1: count = row
    # filter stop words
    text = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(text)
    freq = [(w, count[index]) for index, w in enumerate(words)
            if w not in stopwords]
    # sort by frequency (the second element of each pair), most frequent first
    freq.sort(key=lambda pair: pair[1], reverse=True)
    # Concordance
    most_freq_words = freq[:10]
    summary = []
    h = histogram(lines, most_freq_words)
    rowcount = threshold(h)
    summary = [(index, line) for index, line in enumerate(lines)
               if index < rowcount]
    summary.sort()
    ret = [line[1] for line in summary]
    print '.'.join(ret)
    return ret
Example #11
 def test_tdm_df(self):
     tdm = txm.TermDocumentMatrix()
     for doc in self.doclist:
         tdm.add_doc(doc)
     l = [r for r in tdm.rows(cutoff=1)]
     df = pd.DataFrame(np.array(l[1:]), columns=l[0])
     result = tdm_df(self.doclist, remove_punctuation=False)
     assert_frame_equal(result, df)
Example #12
def createTermDocM(C, name):
    tdm = textmining.TermDocumentMatrix()
    for x in C:
        tdm.add_doc(C[x])
    tdm.write_csv('%s .csv' % name)
    names = [x.encode('utf-8') for x in C]
    f = open("authnames.txt", 'w')
    f.writelines([name + '\n' for name in names])
    f.close()
    return tdm
Example #13
def main():
    # This file should only include rows of text. Be careful of mid-string
    # linebreaks!
    with open("train_plus_test_reviews.csv", "r") as f:
        tdm = textmining.TermDocumentMatrix()
        for line in f:
            tdm.add_doc(line)
        # Only include words which appear in 2+ documents
        tdm.write_csv('matrix.csv', cutoff=2)
Example #14
def texts2matrix(texts, titles, fname='dtm.csv'):
    M = textmining.TermDocumentMatrix()
    for text in texts:
        M.add_doc(text)
    M.write_csv(fname, cutoff=3)
    tname = fname.split('.')[0] + '_filename.txt'
    f = open(tname, 'w')
    for i in titles:
        f.write("%s\n" % i)
    f.close()
    print "matrix saved as " + fname + " and filenames as " + tname
Example #15
def termdocumentmatrix_example(inputPath, inputFile, outputPath, outputFile):
    with open(inputPath + inputFile, 'rb') as f:
        tdm = textmining.TermDocumentMatrix()
        count = 1
        for line in f:
            vals = line.split('^')
            try:
                tdm.add_doc(vals[0])
            except IndexError, e:
                print str(count) + "th row data format error"
            count = count + 1
Example #16
 def create_termdocument_matrix(self, tokenized=False):
     """
     Creates a term-document matrix of the frequencies of each term in each document held in self.docs
     :return:
     """
     self.tdm = tm.TermDocumentMatrix()
     for doc in self.docs:
         if not tokenized:
             self.tdm.add_doc(doc)
         else:
             self.tdm.add_tokenized_doc(doc)
def create_frequency_matrix(documents, cutoff=2, path_to_save=None):
    # x = np.array([1, 1, 1, 2, 2, 2, 5, 25, 1, 1])
    # y = np.bincount(x)
    # ii = np.nonzero(y)[0]
    # return zip(ii, y[ii])
    tdm = textmining.TermDocumentMatrix()
    for doc in documents:
        tdm.add_doc(doc)
    if path_to_save is not None:
        tdm.write_csv(path_to_save, cutoff=cutoff)
    return tdm.rows(cutoff=cutoff)
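A quick usage sketch; with cutoff=1 the first yielded row is the vocabulary and each later row is the term-count vector of one input document:

docs = ['apples and oranges', 'apples and pears']
for row in create_frequency_matrix(docs, cutoff=1, path_to_save=None):
    print(row)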
Example #18
def prepare_data_2(clean_reviews=[]):
    '''prepares reviews by creating the LDA corpus'''
    
    tdm = textmining.TermDocumentMatrix()
    for doc1 in clean_reviews :
        tdm.add_doc(doc1)
    temp = list(tdm.rows(cutoff=2))
    vocab = tuple(temp[0])
    X = np.array(temp[1:])
    
    return X,vocab  
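A short usage sketch; clean_reviews is expected to be a list of already-cleaned review strings:

X, vocab = prepare_data_2(['great product works well',
                           'works well and great value'])
print(X.shape)      # (number of reviews, vocabulary size after the cutoff)
print(vocab[:5])    # first few terms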
def most_frequent_terms(*args):
    tdm = textmining.TermDocumentMatrix(simple_tokenize_remove_our_stopwords)
    for doc in args:
        tdm.add_doc(doc)

    freqs = []
    for d in tdm.sparse:
        f = [(freq, name) for (name, freq) in list(d.items())]
        f.sort(reverse=True)
        freqs.append(f)

    return freqs
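A usage sketch for most_frequent_terms; it assumes the custom tokenizer simple_tokenize_remove_our_stopwords is defined in the same module:

freqs = most_frequent_terms('the cat sat on the mat',
                            'the dog chased the cat')
# freqs[0] lists (count, term) pairs for the first document,
# highest counts first.
print(freqs[0][:3])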
Example #20
def tdm_df(doclist,
           stopwords=[],
           remove_punctuation=True,
           remove_digits=True,
           sparse_df=False):
    '''
    Create a term-document matrix from a list of e-mails.

    Uses the TermDocumentMatrix function in the `textmining` module.
    But, pre-processes the documents to remove digits and punctuation,
    and post-processes to remove stopwords, to match the functionality
    of R's `tm` package.

    NB: This is not particularly memory efficient and you can get memory
    errors with an especially long list of documents.

    Returns a DataFrame (optionally sparse, via `sparse_df=True`). Each column
    is a term, each row is a document.
    '''

    # Create the TDM from the list of documents.
    tdm = txtm.TermDocumentMatrix()

    for doc in doclist:
        if remove_punctuation == True:
            doc = doc.translate(None, string.punctuation.translate(None, '"'))
        if remove_digits == True:
            doc = doc.translate(None, string.digits)

        tdm.add_doc(doc)

    # Push the TDM data to a list of lists,
    # then make that an ndarray, which then
    # becomes a DataFrame.
    tdm_rows = []
    for row in tdm.rows(cutoff=1):
        tdm_rows.append(row)

    tdm_array = np.array(tdm_rows[1:])
    tdm_terms = tdm_rows[0]
    df = DataFrame(tdm_array, columns=tdm_terms)

    # Remove stopwords from the dataset, manually.
    # TermDocumentMatrix does not do this for us.
    if len(stopwords) > 0:
        for col in df:
            if col in stopwords:
                del df[col]

    if sparse_df:
        df = df.to_sparse(fill_value=0)

    return df
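A brief usage sketch for tdm_df; this version relies on the two-argument form of str.translate, so it targets Python 2 (the Python 3 variant appears later in this listing):

emails = ['Hello Bob, meeting at 10?',
          'Hello Alice, lunch at noon!']
df = tdm_df(emails, stopwords=['hello'],
            remove_punctuation=True, remove_digits=True)
print df.shape           # (2 documents, number of remaining terms)
print list(df.columns)   # terms, with the stopword 'hello' removed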
Example #21
 def __init__(self):
     self.docs = None
     self.X = None
     self.features = None
     self.tdm = textmining.TermDocumentMatrix()
     self.models_list = []
     self.k_list = None
     self.topics_n = None
     self.topic_labels = []
     self.models_matrix = None
     self.cos_X = None
     self.cos_list = []
     self.ftps = []
Example #22
def count_terms(site_dict):
    terms_matrix = textmining.TermDocumentMatrix()
    for site, text in site_dict.items():
        terms_matrix.add_doc(text)

    terms_df = pd.DataFrame(terms_matrix.rows())
    terms_df.columns = terms_df.iloc[0]
    terms_df = terms_df[1:]
    terms_df.index = site_dict.keys()
    terms_df.index.name = 'site'
    terms_df = terms_df.T
    terms_df.index.name = 'term'
    terms_df = terms_df.reset_index()
    return terms_df
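A quick usage sketch for count_terms; site_dict maps a site name to the text extracted from it:

pages = {
    'site_a': 'python text mining example',
    'site_b': 'python pandas example example',
}
terms_df = count_terms(pages)
print(terms_df.head())   # one row per term, one column per site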
Example #23
def tdm_df(doclist):
    tdm = textmining.TermDocumentMatrix()
    if len(doclist) > 0:
        for doc in doclist:
            tdm.add_doc(doc)
        tdm_rows, occurrence = [], []
        for rows in tdm.rows():
            tdm_rows.append(rows)

        tdm_array = np.array(tdm_rows[1:])
        tdm_terms = tdm_rows[0]

        df = pd.DataFrame(tdm_array, columns=tdm_terms)
        return df
Example #24
def preprocess(inputFile,f_name):
        
        # Read the text file
        file = open(inputFile, 'r')
        
        text = file.read()      
        
        text = text.replace('\n',' ')

        #Number of words in the text
        words_count = len(word_tokenize(strip_punctuation(text)))
  
        # split in to sentences and store the sentences in a list
        sentences = tokenize.sent_tokenize(text)
        
        #Original Sentences
        sentences_backup = list(sentences)
        
        
        filtered_sentences = []

        
        # Apply stop word removal to each sentence
        stop_words = set(stopwords.words('english'))
            
        for i in range(len(sentences_backup)):
            temp = []
            word_tokens = word_tokenize(strip_punctuation(sentences_backup[i]))
            for w in word_tokens:
                if w.lower() not in stop_words:
                    temp.append(w.lower())
            filtered_sentences.append(temp)

        tdm = textmining.TermDocumentMatrix()
        for i in range(len(sentences)):
            sent = " ".join(filtered_sentences[i])
            tdm.add_doc(sent)
        
        temp = list(tdm.rows(cutoff=1))
        vocab = tuple(temp[0])
        
        X = np.array(temp[1:],dtype = 'float64')
        X1 = X.transpose()

        
        fileObj = ".\\Pre_Processed\\"+f_name.replace('.txt','')+".csv"
        np.savetxt(fileObj, X1, fmt='%1.5f', delimiter=",")
        vocab1 = tuple(zip(vocab))
Example #25
def termdocumentmatrix_example(xDIR):

    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR, i)).read())

    # Write out the matrix to a csv file. Note that setting cutoff=1 means
    # that words which appear in 1 or more documents will be included in
    # the output (i.e. every word will appear in the output). The default
    # for cutoff is 2, since we usually aren't interested in words which
    # appear in a single document. For this example we want to see all
    # words however, hence cutoff=1.
    tdm.write_csv('/Users/XW/Desktop/datascience.stackexchange.com/answer.csv',
                  cutoff=1)  # write out the result
 def fitLDA(self, nTopics, nTopWords):  #Fit LDA model
     topicsList = []
     tdm = textmining.TermDocumentMatrix(
         tokenizer=textmining.simple_tokenize_remove_stopwords)
     for index, row in self.typeData.iterrows():
         if isinstance(row["Title/Description"], basestring):
             tdm.add_doc(row["Title/Description"])
     temp = list(tdm.rows(cutoff=1))
     vocab = tuple(temp[0])
     X = np.array(temp[1:])
     self.model = lda.LDA(n_topics=nTopics, n_iter=500, random_state=1)
     self.model.fit_transform(X)
     topicWord = self.model.topic_word_  # model.components_ also works
     topWords = nTopWords
     for i, topic_dist in enumerate(topicWord):
         topicWords = np.array(vocab)[np.argsort(topic_dist)][:-topWords:-1]
         topicsList.append(topicWords)
     return topicsList
Example #27
def termdocumentmatrix_example(xDIR):
    # Initialize class to create term-document matrix
    count = 0
    tdm = textmining.TermDocumentMatrix()
    for i in os.listdir(xDIR):
        Res = tdm.add_doc(open(os.path.join(xDIR, i)).read())


    # Write out the matrix to a csv file. Note that setting cutoff=1 means
    # that words which appear in 1 or more documents will be included in
    # the output (i.e. every word will appear in the output). The default
    # for cutoff is 2, since we usually aren't interested in words which
    # appear in a single document. For this example we want to see all
    # words however, hence cutoff=1.
    tdm.write_csv('/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/matrix.csv', cutoff=1)  # write out the result
    # Instead of writing out the matrix you can also access its rows directly.
    # Let's print them to the screen.
    for row in tdm.rows(cutoff=1):
        print row
def construct_doc_term_matrix(tok_folders, indices):
    '''
    Take a list of paths to folders where tok.mda files are stored, and return
    1. the NumDoc x NumVocab numpy document-term matrix
       (documents ordered by folder order and then by CUSIP number)
    2. the vocabulary
    3. the number of documents in the last folder
    Folders should not end with "/"
    Folders should be ordered by time
    '''

    tm_tdm = textmining.TermDocumentMatrix()
    document_count = 0
    document_last_count = 0
    for i in range(len(tok_folders)):
        tok_folder = tok_folders[i]
        tokfile_list = os.listdir(tok_folder)
        tokfile_list.sort()
        document_last_count = 0
        index = indices[i]
        for j in range(len(tokfile_list)):
            if j not in index:
                continue
            tokfile_name = tokfile_list[j]
            with open(tok_folder + "/" + tokfile_name) as tokfile:
                line = tokfile.readline()
                # in the original data, # refers to numbers
                line = re.sub('[#]', 'number', line)
                tm_tdm.add_doc(line)
                document_count += 1
                document_last_count += 1
    np_tdm = 0
    row_index = -1
    vocab = []
    for row in tm_tdm.rows(cutoff=1):
        if row_index < 0:
            np_tdm = np.zeros(shape=(document_count, len(row)))
            vocab = row[:]
        else:
            np_tdm[row_index] = row
        row_index += 1
    return np_tdm, vocab, document_last_count
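A hypothetical call; the folder names and index lists below are illustrative, and indices[i] selects which of the sorted tok files in folder i to include:

tok_folders = ['mda_tok_2014', 'mda_tok_2015']   # hypothetical folder names
indices = [range(10), range(12)]                 # keep the first 10 / 12 files per folder
np_tdm, vocab, n_last = construct_doc_term_matrix(tok_folders, indices)
print(np_tdm.shape)   # (total documents kept, vocabulary size)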
Example #29
def tdm_df(doclist,
           stopwords=[],
           remove_punctuation=True,
           remove_digits=True,
           sparse_df=False):
    """
    Create a term-document matrix from a list of e-mails.
    Uses the TermDocumentMatrix function in the `textmining` module.
    But, pre-processes the documents to remove digits and punctuation,
    and post-processes to remove stopwords, to match the functionality
    of R's `tm` package.
    """
    tdm = txtm.TermDocumentMatrix()

    for doc in doclist:
        if remove_punctuation == True:
            translator_pun = str.maketrans('', '', string.punctuation)
            doc = doc.translate(translator_pun)
        if remove_digits == True:
            translator_digt = str.maketrans('', '', string.digits)
            doc = doc.translate(translator_digt)
        tdm.add_doc(doc)

    # Push the TDM data to a list of lists,
    # then make that an ndarray, which then
    # becomes a DataFrame.
    tdm_rows = []
    for row in tdm.rows(cutoff=1):
        tdm_rows.append(row)

    tdm_array = np.array(tdm_rows[1:])
    tdm_terms = tdm_rows[0]
    df = DataFrame(tdm_array, columns=tdm_terms)

    if len(stopwords) > 0:
        for col in df:
            if col in stopwords:
                del df[col]

    if sparse_df:
        df = df.to_sparse(fill_value=0)

    return df
Example #30
def create_keyword_table():
    with open(os.path.join(get_paths()['working_dir'], 'Paper.csv'),
              'r',
              encoding="utf-8") as paper_file:
        paper = csv.reader(paper_file)
        paper_column = next(paper)

        tdm = textmining.TermDocumentMatrix()

        papers = []
        for x in paper:
            papers.append(x)
            tdm.add_doc(
                re.sub(r"[^A-Za-z0-9 _]",
                       " ",
                       ' '.join([str(x[1]), str(x[5])]),
                       flags=re.UNICODE))

        keywords = []
        cutoff = 30
        stopwords = textmining.stopwords
        stopwords.update(['key', 'words', 'keywords', 'keyword', 'word'])
        for (paper_i, tdm_i) in zip(papers, tdm.sparse):
            id_paper = paper_i[0]
            year = paper_i[2]
            id_conference = paper_i[3]
            id_journal = paper_i[4]
            paper_words = [
                [id_paper, year, id_conference, id_journal, word]
                for word in tdm_i.keys()
                if tdm.doc_count[word] >= cutoff and word not in stopwords
            ]
            keywords.extend(paper_words)

        with open(os.path.join(get_paths()['working_dir'], 'keywords.csv'),
                  'w',
                  encoding="utf-8") as keyword_file:
            keyword = csv.writer(keyword_file)
            keyword.writerow(
                ('paperid', 'year', 'conferenceid', 'journalid', 'keyword'))
            for x in keywords:
                keyword.writerow(x)