Ejemplo n.º 1
0
def cosine_similarity(list_of_file_paths):
    '''
    Compute the pairwise cosine similarity of the documents at the given paths.

    Each path is converted to text with filehandler.convert_to_txt and added
    to a textmining.TermDocumentMatrix that tokenizes with
    simple_tokenize_remove_our_stopwords.

    Returns a list of lists in which results[i][j] is
    1 - spatial.distance.cosine(row_i, row_j) for the i-th and j-th document
    rows yielded by the matrix (an empty list when there are no documents).
    '''
    documents = [
        filehandler.convert_to_txt(file_path)
        for file_path in list_of_file_paths
    ]
    tdm = textmining.TermDocumentMatrix(
        tokenizer=simple_tokenize_remove_our_stopwords)
    for document in documents:
        tdm.add_doc(document)
    # Build the rows once instead of re-running tdm.rows() for every outer
    # row. The first yielded row is dropped, as before: per the textmining
    # documentation it is the header row of terms, not a document vector.
    # NOTE(review): this holds every row in memory at once; confirm that is
    # acceptable for the largest expected corpus.
    doc_vectors = list(tdm.rows(cutoff=1))[1:]
    return [
        [1 - spatial.distance.cosine(left, right) for right in doc_vectors]
        for left in doc_vectors
    ]
Ejemplo n.º 2
0
def tf_idf(list_of_file_paths):
    '''
    Compute and return tf-idf from a list of file paths (sorted by tfidf desc)

    Returns one list per input path. Each inner list holds one dict per term
    with the keys 'term', 'tfidf' (frequency * idf[term]) and 'frequency',
    ordered by 'tfidf' from highest to lowest.
    '''
    doc_list = [filehandler.convert_to_txt(file_path) for file_path in list_of_file_paths]
    # Per the original author's note, each entry is a FreqDist object.
    tf_list = [term_frequency(doc_to_words(doc)) for doc in doc_list]
    idf = inverse_document_frequency(tf_list)
    tf_idf_list = []
    for tf in tf_list:
        # .items() instead of the Python-2-only .iteritems(), so the function
        # runs on both Python 2 and Python 3.
        entries = [
            {'term': term, 'tfidf': frequency * idf[term], 'frequency': frequency}
            for term, frequency in tf.items()
        ]
        tf_idf_list.append(sorted(entries, key=itemgetter('tfidf'), reverse=True))
    return tf_idf_list
Ejemplo n.º 3
0
def tf_idf(list_of_file_paths):
    '''
    Compute and return tf-idf from a list of file paths (sorted by tfidf desc)

    The result has one entry per input path: a list of dicts with the keys
    'term', 'tfidf' (the term's frequency multiplied by idf[term]) and
    'frequency', sorted by descending 'tfidf'.
    '''
    doc_list = [
        filehandler.convert_to_txt(file_path)
        for file_path in list_of_file_paths
    ]
    # Per the original author's note, this is a list of FreqDist objects.
    tf_list = [term_frequency(doc_to_words(doc)) for doc in doc_list]
    idf = inverse_document_frequency(tf_list)

    by_tfidf = itemgetter('tfidf')
    tf_idf_list = []
    for tf in tf_list:
        # Use .items() rather than .iteritems(): the latter only exists on
        # Python 2 mappings, while .items() is available on both 2 and 3.
        scored_terms = [{
            'term': term,
            'tfidf': frequency * idf[term],
            'frequency': frequency
        } for term, frequency in tf.items()]
        tf_idf_list.append(sorted(scored_terms, key=by_tfidf, reverse=True))
    return tf_idf_list
Ejemplo n.º 4
0
def cosine_similarity(list_of_file_paths):
    '''
    Return the matrix of pairwise cosine similarities between documents.

    Every path in list_of_file_paths is converted to text via
    filehandler.convert_to_txt and added to a textmining.TermDocumentMatrix
    (tokenizer: simple_tokenize_remove_our_stopwords).

    The result is a list of lists; entry [i][j] equals
    1 - spatial.distance.cosine(row_i, row_j) for document rows i and j of
    the matrix. With no documents the result is an empty list.
    '''
    doc_list = [filehandler.convert_to_txt(file_path) for file_path in list_of_file_paths]
    tdm = textmining.TermDocumentMatrix(tokenizer=simple_tokenize_remove_our_stopwords)
    for doc in doc_list:
        tdm.add_doc(doc)
    # tdm.rows() is evaluated a single time here rather than once per outer
    # row. The leading row is skipped exactly as the original flag-based
    # loops did (textmining documents it as the header row of terms).
    # NOTE(review): all rows are kept in memory together; confirm this is
    # fine for the corpus sizes in use.
    doc_rows = list(tdm.rows(cutoff=1))[1:]
    results = []
    for row1 in doc_rows:
        results.append([1 - spatial.distance.cosine(row1, row2) for row2 in doc_rows])
    return results