from operator import itemgetter

import textmining
from scipy import spatial

# Module-local helpers assumed to be defined elsewhere in this package:
# filehandler, simple_tokenize_remove_our_stopwords, doc_to_words,
# term_frequency, and inverse_document_frequency.


def cosine_similarity(list_of_file_paths):
    ''' Compute the pairwise cosine similarity matrix for the documents at the given file paths '''
    # Convert each input file to plain text
    doc_list = [filehandler.convert_to_txt(file_path) for file_path in list_of_file_paths]
    # Initialize class to create term-document matrix
    tdm = textmining.TermDocumentMatrix(tokenizer=simple_tokenize_remove_our_stopwords)
    for doc in doc_list:
        tdm.add_doc(doc)
    results = []
    is_first_row1 = True
    for row1 in tdm.rows(cutoff=1):
        # The first row from tdm.rows() is the header listing the terms, so skip it
        if is_first_row1:
            is_first_row1 = False
            continue
        is_first_row2 = True
        cols = []
        for row2 in tdm.rows(cutoff=1):
            if is_first_row2:
                is_first_row2 = False
                continue
            # Cosine similarity is 1 minus the cosine distance between the two term-count vectors
            cols.append(1 - spatial.distance.cosine(row1, row2))
        results.append(cols)
    return results
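
# Illustrative sketch only (not part of the original module): a tiny, self-contained
# example of the value computed inside cosine_similarity() above, using two hypothetical
# term-count vectors instead of a real term-document matrix. It relies only on scipy,
# which is already imported above.
def _example_cosine_similarity_value():
    vec_a = [1, 0, 2, 3]  # toy term counts for document A
    vec_b = [1, 1, 2, 0]  # toy term counts for document B
    # Identical vectors give 1.0, vectors with no shared terms give 0.0; these give ~0.55
    return 1 - spatial.distance.cosine(vec_a, vec_b)
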
def tf_idf(list_of_file_paths):
    '''
    Compute and return tf-idf scores for each document in a list of file paths
    (each document's terms sorted by tf-idf, descending)
    '''
    # Convert each input file to plain text
    doc_list = [filehandler.convert_to_txt(file_path) for file_path in list_of_file_paths]
    # term_frequency(doc_to_words(doc)) yields one FreqDist of term counts per document
    tf_list = [term_frequency(doc_to_words(doc)) for doc in doc_list]
    # Map each term to its inverse document frequency across the whole collection
    idf = inverse_document_frequency(tf_list)
    # tf-idf for a term in a document is its frequency there times its collection-wide idf
    tf_idf_list = [[{'term': term, 'tfidf': frequency * idf[term], 'frequency': frequency}
                    for term, frequency in tf.items()]
                   for tf in tf_list]
    # Sort each document's terms by tf-idf, highest first
    tf_idf_list = [sorted(tf_idf, key=itemgetter('tfidf'), reverse=True)
                   for tf_idf in tf_idf_list]
    return tf_idf_list
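
# Usage sketch only: the file paths below are hypothetical placeholders, and
# filehandler.convert_to_txt() is assumed to be able to read them; point these at
# real files to actually run the demo.
if __name__ == '__main__':
    sample_paths = ['sample-doc-1.txt', 'sample-doc-2.txt']  # hypothetical inputs

    # Show the five highest-scoring tf-idf terms for each document
    for doc_index, doc_scores in enumerate(tf_idf(sample_paths)):
        top_terms = [entry['term'] for entry in doc_scores[:5]]
        print('doc %d top terms: %s' % (doc_index, top_terms))

    # Show the document-by-document cosine similarity matrix
    for row in cosine_similarity(sample_paths):
        print(row)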