Example #1
def process_sentences(counter, sent, s_list, corp, n_corp, fname, data_file, text_file):  #,
                      # extra_file1, extra_file2, extra_file3, browser):
    norm_sent = [normalize(s, stops) for s in sent]
    for j, s in enumerate(sent):
        try:
            sb = sum_basic(norm_sent[j], n_corp)
        except Exception:
            print "sum_basic failed on sentence:"
            print s
            print_list(norm_sent)
            print "*******"
            print_list(sent)
            continue  # sb is undefined past this point, so skip this sentence

        sm = sim(norm_sent[j], corp)
        (k, p) = position(s, s_list)
        # NOTE: unnormalized sentences
        # links1 = external(s, 1, stops, browser)
        # links2 = external(s, 2, stops, browser)
        # links3 = external(s, 3, stops, browser)
        cval = counter[data_file]
        data_file.write('%d,%d,%s,' % (cval, k, fname) + ','.join('%.3f' % x for x in [sb, sm, p]) + '\n')
        text_file.write('%d,%d,%s,' % (cval, k, fname) + s.encode('utf-8') + '\n')
        # extra_file1.write('%d,%s' % (cval, links1))
        # extra_file2.write('%d,%s' % (cval, links2))
        # extra_file3.write('%d,%s' % (cval, links3))

        counter[data_file] += 1
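Example #1 leans on a sum_basic(...) helper that is not shown among these snippets. A minimal SumBasic-style sketch, assuming norm_sent is a whitespace-joined string of normal forms and n_corp is a list of normalized documents (the names and behaviour here are assumptions, not the original code):

def sum_basic(norm_sent, n_corp):
    # Hypothetical sketch, not the original: average word probability of
    # the sentence over the whole normalized corpus (the SumBasic score).
    corpus_words = ' '.join(n_corp).split()
    total = float(len(corpus_words))
    freq = {}
    for w in corpus_words:
        freq[w] = freq.get(w, 0) + 1
    sent_words = norm_sent.split()
    if not sent_words:
        # an empty normalized sentence is the failure the try/except above guards against
        raise ValueError('empty sentence')
    return sum(freq.get(w, 0) / total for w in sent_words) / len(sent_words)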
Example #2
def corp(files):
    # load each file, keeping both its raw text and its normalized form
    res = []
    norm = []
    for f in files:
        print "load %s" % f
        t = md.load(f)
        res.append(t)
        norm.append(normalize(t))
    return (res, norm)
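A typical call, matching the corpus / norm_corp names used in Example #4 (the paths are illustrative):

files = ['article1.md', 'article2.md']  # illustrative paths
(corpus, norm_corp) = corp(files)       # raw texts and their normalized forms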
Example #3
def inheader(word, text):
    # substring test: does the (normalized) word occur in the normalized header?
    header = md.header(text)
    nhead = normalize(header)
    return word in nhead
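Nearly every snippet calls normalize(...), which is not among the examples. A minimal sketch of one plausible implementation, assuming pymorphy2 lemmatization for Russian text; the snippets call it both with and without a stop list, so the optional stops argument is an assumption:

import re
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

def normalize(text, stops=()):
    # Hypothetical sketch, not the original: lowercase, tokenize,
    # lemmatize each word, and drop stop words.
    words = re.findall(r'\w+', text.lower(), re.UNICODE)
    lemmas = [morph.parse(w)[0].normal_form for w in words]
    return ' '.join(w for w in lemmas if w not in stops)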
Example #4
    ngrams_file = codecs.open('ngrams_data.csv', "w", "utf-8")
    ngrams_file.write('id,file,word,normal_form,n,tf,d,idf,tf-idf,bm25,in_header,pos,is_key\n')

    counter = {keywords_file: 0, ngrams_file: 0}

    for j in range(len(files)):
        print 'Processing %s' % files[j]
        text = corpus[j]
        norm = norm_corp[j]
        keys = md.keywords(text)
        # print_list(text)
        # print '&&&&&&&&&&&&&'
        # print_list(keys)
        # print '----'
        normkeys = [normalize(k) for k in keys]
        # print_list(normkeys)
        write_stat(counter, text, norm, norm_corp, keys, normkeys, files[j], 1, keywords_file)

        # now do the same for randomly chosen n-grams (n = 1, 2, 3);
        # the last column simply gets a 0
        ng = md.ngrams(1, text, stops) + md.ngrams(2, text, stops) + md.ngrams(3, text, stops)
        ngrams = random.sample(ng, 15)
        normgr = [normalize(k) for k in ngrams]
        normgrams = []
        temp = ngrams
        ngrams = []
        # make sure that no part of a phrase ended up among the keys
        k = 0
        for phrase in normgr:
            phr = phrase.split(' ')
            # plausible completion of the truncated snippet: keep the n-gram
            # only if none of its words occurs inside a key phrase
            if not any(w in nk.split(' ') for nk in normkeys for w in phr):
                normgrams.append(phrase)
                ngrams.append(temp[k])
            k += 1
        write_stat(counter, text, norm, norm_corp, ngrams, normgrams, files[j], 0, ngrams_file)
Example #5
def sim(norm_sent, text):
    title = md.header(text)
    norm_title = normalize(title)
    return sum_basic(norm_sent, norm_title)
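So a sentence's sim score is just its SumBasic score computed against the normalized title: it is high when the sentence shares normal forms with the header. A one-line usage sketch, with the variable names of Example #1:

sm = sim(norm_sent[j], corp)  # how close sentence j is to the document's title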
Example #6
        exit(1)
    content = load(sys.argv[2])
    opt = sys.argv[1]
    if opt == "-k":
        rs = keywords(content)
        for k in rs:
            print k.encode("utf-8")
    elif opt == "-s":
        ss = sentences(content)
        for s in ss:
            print s.encode("utf-8")
    # elif opt == '-q':
    #     ss = unmarked_sentences(content)
    #     for s in ss:
    #         print s.encode('utf-8')
    elif opt.startswith("-r"):
        n, k = parseopt(opt)
        # print "n=%d, k=%d" % (n, k)
        stops = loadstops("stops_ru.txt")
        ng = ngrams(n, content, stops)
        samples = random.sample(ng, k)
        for s in samples:
            print s.encode("utf-8")
        # for k in ng[:5]:
        #     print k.encode('utf-8')
    elif opt == "-n":
        print normalize(content)
    else:
        print "Unrecognized option: %s " % opt
        usage(sys.argv[0])
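A few illustrative invocations, assuming this block lives in a script named md.py (the script name is an assumption, and the exact -r option syntax is handled by parseopt and not shown here):

python md.py -k article.md  # print the extracted keywords
python md.py -s article.md  # print the document split into sentences
python md.py -n article.md  # print the normalized text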