Ejemplo n.º 1
0
def _load_r_package(script_path, package_name):
    """Read the R script at *script_path* and wrap it with
    SignatureTranslatedAnonymousPackage under the name *package_name*."""
    with open(script_path) as handle:
        # Hand the script text over unchanged: joining readlines() with
        # os.linesep would insert an extra line break after every line.
        rcode = handle.read()
    return SignatureTranslatedAnonymousPackage(rcode, package_name)


def _feature_values(vec):
    """Convert the object returned by genFeatures into a 1-D float array.

    The elements of *vec* are stringified and concatenated, the result is
    split on whitespace, the first token is dropped, and for every remaining
    token the text after its first ':' is parsed as a float.
    """
    text = ''.join(str(item) for item in np.array(vec))
    tokens = text.split()
    # NOTE(review): the first token is discarded unconditionally; presumably
    # it is a label emitted by genFeatures.R -- confirm against the script.
    del tokens[0]
    # float() accepts both integer and decimal notation, so no separate
    # int/float branch is needed (the old `find('.')` test mis-routed values
    # such as ".5" to int() and raised ValueError).
    return np.asarray(
        [float(token.split(":", 1)[-1]) for token in tokens], dtype=float)


def dissimilarity_41(tree_file, matrix_folder, method_name):
    """Build a pairwise matrix for the texts of *tree_file* using R features.

    The content of *tree_file* is split on the ``<\\tphyldoc>`` marker, each
    piece is lower-cased and stripped of punctuation, and the pieces are
    written to ``generated_corpus_<name>`` next to this module. The R
    scripts ``longworkStemBatchEn.R``, ``loadCorpus.R`` and ``genFeatures.R``
    (opened relative to the current working directory) are then used to stem
    the corpus, load it, and produce a feature vector for every ordered pair
    of texts. The stored value for a pair is the dot product of that feature
    vector with itself.

    Args:
        tree_file: path of the input file holding all texts.
        matrix_folder: base folder handed to ``write_matrix``.
        method_name: sub-folder name joined onto *matrix_folder*.

    Returns:
        The matrix as a list of lists; the diagonal is 0.

    Side effects:
        Creates the ``generated_corpus_*`` and ``processed_corpus_*``
        directories, writes one text file per document, prints progress, and
        writes the matrix through ``write_matrix``.
    """
    # split tree file into individual texts
    raw_texts = read_file(tree_file).split("<\\tphyldoc>")

    print("os.path(tree_file):  %s" % tree_file)

    # lowercase, strip punctuation, terminate each text with a newline
    nonstemmed_corpus = [
        text.lower().translate(None, string.punctuation) + "\n"
        for text in raw_texts
    ]

    corpus_number = len(nonstemmed_corpus)
    dis_matrix = [[0] * corpus_number for _ in range(corpus_number)]

    path = os.path.dirname(os.path.abspath(__file__))
    path_index = os.path.splitext(os.path.basename(tree_file))[0]
    nonstemmed_corpus_path = os.path.join(
        path, "generated_corpus_{}".format(path_index))

    if not os.path.exists(nonstemmed_corpus_path):
        os.makedirs(nonstemmed_corpus_path)

    # enumerate() gives every document its own index; list.index() returned
    # the first match, so duplicate texts used to overwrite a single file.
    for doc_id, text in enumerate(nonstemmed_corpus):
        write_file(
            nonstemmed_corpus_path,
            "nonstemmed_{}.txt".format(doc_id),
            text)

    # directories handed to the R scripts
    outputDir = os.path.join(path, "processed_corpus_{}".format(path_index))
    outputDir += "/"
    inputDir = nonstemmed_corpus_path

    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # stem the corpus to generate the processed corpus
    myfunc_stem = _load_r_package('longworkStemBatchEn.R', "myfunc_stem")
    myfunc_stem.longworkStemBatchEn(inputDir, outputDir)

    # load the corpus
    myfunc_load = _load_r_package('loadCorpus.R', "myfunc_load")
    r_corpus = myfunc_load.loadCorpus(outputDir)

    # genFeatures.R is loop-invariant: import gtools and compile the script
    # once instead of once per pair. Skipped when there is no pair to
    # compare, so the script is only required when it is actually used.
    myfunc_genf = None
    if corpus_number > 1:
        importr("gtools")
        myfunc_genf = _load_r_package('genFeatures.R', "myfunc_genf")

    # NOTE(review): every unordered pair is evaluated in both orders and each
    # evaluation writes both mirrored cells, so the later call (larger index
    # as first argument) determines the stored value. If genFeatures is
    # symmetric, half of these calls could be skipped -- confirm.
    for file1_id in range(corpus_number):
        file1 = nonstemmed_corpus[file1_id]
        dis_matrix[file1_id][file1_id] = 0
        for file2_id in range(corpus_number):
            if file2_id == file1_id:
                continue
            file2 = nonstemmed_corpus[file2_id]

            # vec is the resulting vector generated by genFeatures.R
            vec = myfunc_genf.genFeatures(file1, file2, r_corpus, 1)

            features = _feature_values(vec)
            # dot product of the feature vector with itself
            value = np.dot(features, features)
            dis_matrix[file1_id][file2_id] = value
            dis_matrix[file2_id][file1_id] = value
            print("dis_matrix %s %s = %s" % (file1_id, file2_id, value))

    filename = tree_file.split(os.sep)[-1].replace(".txt", "") + ".dismat"
    print(filename)
    write_matrix(dis_matrix, os.path.join(matrix_folder, method_name),
                 filename)

    return dis_matrix