def _load_r_script_package(script_path, package_name):
    """Read the R script at *script_path* and wrap it as a callable package.

    The whole script text is handed to ``SignatureTranslatedAnonymousPackage``
    under the name *package_name*; the returned object exposes the functions
    the script defines as attributes.
    """
    with open(script_path) as handle:
        # read() keeps the script text verbatim.  Joining readlines() with
        # os.linesep would double every newline because readlines() already
        # retains the line endings.
        rcode = handle.read()
    return SignatureTranslatedAnonymousPackage(rcode, package_name)


def _squared_feature_norm(vec):
    """Return the sum of squared feature values parsed from *vec*.

    *vec* is the object returned by ``genFeatures``.  Its elements are
    stringified and concatenated, the text is split on whitespace, the first
    token is discarded, and for every remaining token the part after the
    first ``:`` is converted to ``float``.

    Raises:
        IndexError: if the stringified vector contains no tokens at all.
        ValueError: if a token's value part is not a valid number.
    """
    text = ''.join(str(item) for item in np.array(vec))
    tokens = text.split()
    # The first token is not a feature value.
    # NOTE(review): presumably a header/label emitted by genFeatures.R --
    # confirm against the R script.
    del tokens[0]
    # float() handles integer and decimal spellings alike.  The previous
    # ``if newele.find('.')`` test was truthy for "not found" (-1) and falsy
    # only for a leading dot, which sent values such as ".5" to int() and
    # raised ValueError.
    values = [float(token.split(":", 1)[-1]) for token in tokens]
    features = np.asarray(values, dtype=float)
    return np.dot(features, features)


def dissimilarity_41(tree_file, matrix_folder, method_name):
    """Compute and store a pairwise dissimilarity matrix for *tree_file*.

    The file is split into individual texts, each text is lower-cased and
    stripped of punctuation, and the results are written to
    ``generated_corpus_<stem>/nonstemmed_<i>.txt`` next to this module
    (``<stem>`` is the base name of *tree_file* without extension).  The R
    script ``longworkStemBatchEn.R`` then produces ``processed_corpus_<stem>/``
    from that folder, ``loadCorpus.R`` loads it, and ``genFeatures.R`` is
    called for every ordered pair of distinct texts.  The sum of squares of
    the parsed feature values is stored in both ``[i][j]`` and ``[j][i]``.

    The three R scripts are opened relative to the current working directory.

    Args:
        tree_file: path of the input file containing all texts.
        matrix_folder: folder joined with *method_name* and passed to
            ``write_matrix`` as the destination.
        method_name: sub-folder name joined onto *matrix_folder*.

    Returns:
        The matrix as a list of lists; diagonal entries are ``0``.
    """
    # The separator is the literal text <\tphyldoc> (a backslash followed by
    # "t", not a tab character).
    raw_texts = read_file(tree_file).split("<\\tphyldoc>")
    print("os.path(tree_file):  %s" % (tree_file,))

    # Normalise every text: lower-case, drop punctuation, end with a newline.
    nonstemmed_corpus = [
        text.lower().translate(None, string.punctuation) + "\n"
        for text in raw_texts
    ]
    corpus_number = len(nonstemmed_corpus)
    dis_matrix = [[0] * corpus_number for _ in range(corpus_number)]

    path = os.path.dirname(os.path.abspath(__file__))
    path_index = os.path.splitext(os.path.basename(tree_file))[0]

    nonstemmed_corpus_path = os.path.join(
        path, "generated_corpus_{}".format(path_index))
    if not os.path.exists(nonstemmed_corpus_path):
        os.makedirs(nonstemmed_corpus_path)
    # enumerate() gives every text its own index.  list.index() returned the
    # first match, so duplicate texts overwrote one file and left gaps.
    for index, text in enumerate(nonstemmed_corpus):
        write_file(
            nonstemmed_corpus_path,
            "nonstemmed_{}.txt".format(index),
            text)

    # The trailing "/" is kept because the directory string is handed to the
    # R scripts as-is.
    outputDir = os.path.join(path, "processed_corpus_{}".format(path_index))
    outputDir += "/"
    inputDir = nonstemmed_corpus_path
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # Stem the corpus: reads inputDir, writes the processed corpus to outputDir.
    stem_pkg = _load_r_script_package('longworkStemBatchEn.R', "myfunc_stem")
    stem_pkg.longworkStemBatchEn(inputDir, outputDir)

    # Load the processed corpus into R.
    load_pkg = _load_r_script_package('loadCorpus.R', "myfunc_load")
    r_corpus = load_pkg.loadCorpus(outputDir)

    # Loading gtools and genFeatures.R is loop-invariant, so do it once
    # instead of once per pair.
    importr("gtools")
    genf_pkg = _load_r_script_package('genFeatures.R', "myfunc_genf")

    # Every ordered pair (file1, file2) with file1 != file2 is evaluated.
    # Each result is written to both [i][j] and [j][i], so the value that
    # finally remains for a pair comes from the call whose file1 has the
    # higher index.
    # NOTE(review): if genFeatures is symmetric, half of these R calls could
    # be skipped -- confirm before changing.
    for file1_id in range(corpus_number):
        file1 = nonstemmed_corpus[file1_id]
        for file2_id in range(corpus_number):
            if file2_id == file1_id:
                continue
            file2 = nonstemmed_corpus[file2_id]
            vec = genf_pkg.genFeatures(file1, file2, r_corpus, 1)
            value = _squared_feature_norm(vec)
            dis_matrix[file1_id][file2_id] = value
            dis_matrix[file2_id][file1_id] = value
            print("dis_matrix %s %s = %s" % (file1_id, file2_id, value))

    filename = tree_file.split(os.sep)[-1].replace(".txt", "") + ".dismat"
    print(filename)
    write_matrix(dis_matrix, os.path.join(matrix_folder, method_name),
                 filename)
    return dis_matrix