Ejemplos de SignatureTranslatedAnonymousPackage.loadCorpus en Python

Lenguaje de programación: Python

Namespace/Package Name: rpy2.robjects.packages

Clase / Tipo: SignatureTranslatedAnonymousPackage

Método / Función: loadCorpus

Ejemplos en hotexamples.com: 1

Python SignatureTranslatedAnonymousPackage.loadCorpus - 1 ejemplo encontrado. Este es el ejemplo en Python del mundo real mejor valorado de rpy2.robjects.packages.SignatureTranslatedAnonymousPackage.loadCorpus, extraído de proyectos de código abierto. Puedes valorar los ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

SignatureTranslatedAnonymousPackage(30)

apprentissage(3)

pf_r(3)

load_RData(2)

Detect(2)

doit(2)

r99ptot(1)

r20mm(1)

r10mm(1)

qr_pogs(1)

Cut(1)

ptsPPP(1)

rnnmm(1)

predictPrice(1)

prcptot(1)

model_eval(1)

medicc_clonal_expansion(1)

prepNewAds(1)

safepredict(1)

rx1day(1)

rx5day(1)

saveFile(1)

sdii(1)

splitTerms(1)

square(1)

statcalc(1)

su(1)

test(1)

tnn(1)

tnx(1)

tr(1)

traj_r(1)

txn(1)

txx(1)

longworkStemBatchEn(1)

linear_modeler(1)

localization(1)

cube(1)

MGG_unpresent_Augustus_locate_in_pav_orthofinder(1)

McCOIL_categorical(1)

RFmodel(1)

RFpred(1)

R_remove_MGG_unpresent_Augustus(1)

RunIhacresGw(1)

W(1)

calculate(1)

cdd(1)

cleanJobAds(1)

corr_main_func(1)

createJoinDTM(1)

Ejemplo n.º 1

Mostrar archivo

def _strip_punctuation(text):
    """Return *text* with every ASCII punctuation character removed."""
    # str.translate(None, chars) only exists for Python 2 byte strings;
    # filtering character by character gives the same result there and also
    # works for unicode text.
    punctuation = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation)


def _load_r_package(script_path, package_name):
    """Read the R script at *script_path* and wrap it as an rpy2 package."""
    with open(script_path) as handle:
        # read() keeps the script verbatim; joining readlines() with
        # os.linesep would insert an extra line break after every line.
        rcode = handle.read()
    return SignatureTranslatedAnonymousPackage(rcode, package_name)


def _feature_values(vec):
    """Extract the numeric values from a genFeatures.R result as a float array.

    The elements of *vec* are stringified and concatenated, the text is split
    on whitespace, the first token is dropped, and whatever follows the first
    ":" of each remaining token is converted to float.

    Raises:
        IndexError: if the stringified result contains no tokens at all.
        ValueError: if a token's value part is not a valid number.
    """
    tokens = ''.join(str(item) for item in np.array(vec)).split()
    # The first token is discarded, presumably a header/label rather than a
    # feature -- TODO confirm against genFeatures.R.
    del tokens[0]
    # float() accepts integer and decimal notation alike, including values
    # such as ".5" that start with the decimal point.
    return np.asarray(
        [float(token.split(":", 1)[-1]) for token in tokens], dtype=float)


def dissimilarity_41(tree_file, matrix_folder, method_name):
    """Build a pairwise matrix over the documents stored in *tree_file*.

    The file is split into documents on the "<\\tphyldoc>" marker; each
    document is lowercased and stripped of punctuation, then written to a
    "generated_corpus_<name>" directory next to this module. The R scripts
    longworkStemBatchEn.R, loadCorpus.R and genFeatures.R (opened relative to
    the current working directory) stem the corpus, load it, and produce a
    feature vector per ordered document pair. The matrix entry for a pair is
    the dot product of that feature vector with itself.

    Args:
        tree_file: path of the text file holding all documents.
        matrix_folder: base folder handed to ``write_matrix``.
        method_name: sub-folder name joined onto *matrix_folder*.

    Returns:
        The matrix as a list of lists; the diagonal stays 0.

    Side effects:
        Creates the generated/processed corpus directories, writes one text
        file per document, prints progress, and writes the matrix through
        ``write_matrix`` as "<tree file name>.dismat".
    """
    # split tree file into individual texts
    raw_documents = read_file(tree_file).split("<\\tphyldoc>")

    print("os.path(tree_file):  %s" % (tree_file,))

    # lowercase, drop punctuation, and terminate every text with a newline
    nonstemmed_corpus = [
        _strip_punctuation(document.lower()) + "\n"
        for document in raw_documents
    ]

    corpus_number = len(nonstemmed_corpus)
    dis_matrix = [[0] * corpus_number for _ in range(corpus_number)]

    path = os.path.dirname(os.path.abspath(__file__))
    path_index = os.path.splitext(os.path.basename(tree_file))[0]
    nonstemmed_corpus_path = os.path.join(
        path, "generated_corpus_{}".format(path_index))

    if not os.path.exists(nonstemmed_corpus_path):
        os.makedirs(nonstemmed_corpus_path)

    # enumerate() gives every document its own index; list.index() would
    # return the first match, so duplicate texts would share one file name.
    for doc_id, text in enumerate(nonstemmed_corpus):
        write_file(
            nonstemmed_corpus_path,
            "nonstemmed_{}.txt".format(doc_id),
            text)

    # directories handed to the R scripts
    outputDir = os.path.join(path, "processed_corpus_{}".format(path_index))
    outputDir += "/"
    inputDir = nonstemmed_corpus_path

    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # stem the corpus to generate the processed corpus
    myfunc_stem = _load_r_package('longworkStemBatchEn.R', "myfunc_stem")
    myfunc_stem.longworkStemBatchEn(inputDir, outputDir)

    # load the processed corpus
    myfunc_load = _load_r_package('loadCorpus.R', "myfunc_load")
    r_corpus = myfunc_load.loadCorpus(outputDir)

    # The feature script is loop-invariant, so load it once. It is only
    # needed (and only loaded) when there is at least one pair to compare.
    myfunc_genf = None
    if corpus_number > 1:
        importr("gtools")
        myfunc_genf = _load_r_package('genFeatures.R', "myfunc_genf")

    # Every ordered pair is evaluated and both symmetric cells are written,
    # so the later call for a pair overwrites the earlier one. This is kept
    # as is because genFeatures is not known to be symmetric.
    for file1_id in range(corpus_number):
        file1 = nonstemmed_corpus[file1_id]
        for file2_id in range(corpus_number):
            if file2_id == file1_id:
                continue
            file2 = nonstemmed_corpus[file2_id]

            # vec is the resulting vector generated by genFeatures.R
            vec = myfunc_genf.genFeatures(file1, file2, r_corpus, 1)

            features = _feature_values(vec)
            # transposing a 1-D array is a no-op, so this is the dot
            # product of the feature vector with itself
            value = np.dot(features, features)
            dis_matrix[file1_id][file2_id] = value
            dis_matrix[file2_id][file1_id] = value
            print("dis_matrix %s %s = %s" % (file1_id, file2_id, value))

    filename = tree_file.split(os.sep)[-1].replace(".txt", "") + ".dismat"
    print(filename)
    write_matrix(dis_matrix, os.path.join(matrix_folder, method_name),
                 filename)

    return dis_matrix