def process_sentences(counter, sent, s_list, corp, n_corp, data_file, text_file): #, # extra_file1, extra_file2, extra_file3, browser): norm_sent = [normalize(s, stops) for s in sent] for j in range(len(sent)): s = sent[j] try: sb = sum_basic(norm_sent[j], n_corp) except: print "F**K" print s print_list(norm_sent) print "*******" print_list(sent) sm = sim(norm_sent[j], corp) (k, p) = position(s, s_list) # NOTE: unnormalized sentences # links1 = external(s, 1, stops, browser) # links2 = external(s, 2, stops, browser) # links3 = external(s, 3, stops, browser) cval = counter[data_file] data_file.write('%d,%d,%s,' % (cval, k, files[i]) + ','.join(map(lambda x: '%.3f' % x, [sb, sm, p])) + '\n') text_file.write('%d,%d,%s,' % (cval, k, files[i]) + s.encode('utf-8') + '\n') # extra_file1.write('%d,%s' % (cval, links1)) # extra_file2.write('%d,%s' % (cval, links2)) # extra_file3.write('%d,%s' % (cval, links3)) counter[data_file] += 1
def corp(files): res = [] norm = [] for f in files: print "load %s" % f t = md.load(f) res.append(t) norm.append(normalize(t)) return (res, norm)
def inheader(word, text):
    """Return True when *word* occurs in the normalized header of *text*."""
    return word in normalize(md.header(text))
# Build per-word statistics for keywords and for random n-grams of each document.
# NOTE(review): this chunk is truncated -- the final `for phrase in normgr:`
# loop body continues beyond this view.
ngrams_file = codecs.open('ngrams_data.csv', "w", "utf-8")
ngrams_file.write('id,file,word,normal_form,n,tf,d,idf,tf-idf,bm25,in_header,pos,is_key\n')
# One running row-id per output file.
counter = {keywords_file: 0, ngrams_file: 0}
for j in range(len(files)):
    print 'Processing %s' % files[j]
    text = corpus[j]
    norm = norm_corp[j]
    keys = md.keywords(text)
    # print_list(text)
    # print '&&&&&&&&&&&&&'
    # print_list(keys)
    # print '----'
    normkeys = [normalize(k) for k in keys]
    # print_list(normkeys)
    # Marked keywords: last column (is_key) is 1.
    write_stat(counter, text, norm, norm_corp, keys, normkeys, files[j], 1, keywords_file)
    # now the same for randomly selected n-grams (n = 1, 2, 3);
    # the last column simply gets a 0
    ng = md.ngrams(1, text, stops) + md.ngrams(2, text, stops) + md.ngrams(3, text, stops)
    ngrams = random.sample(ng, 15)
    normgr = [normalize(k) for k in ngrams]
    normgrams = []
    temp = ngrams
    ngrams = []
    # make sure no part of a sampled phrase ended up among the keywords
    k = 0
    for phrase in normgr:
        phr = phrase.split(' ')
def sim(norm_sent, text):
    """SumBasic score of *norm_sent* against the normalized title of *text*."""
    normalized_title = normalize(md.header(text))
    return sum_basic(norm_sent, normalized_title)
# NOTE(review): this chunk starts mid-script -- `exit(1)` is most likely the
# tail of an argument-count guard whose `if`/`usage(...)` lines precede this
# view; confirm its original indentation before relying on it.
exit(1)
# Command-line driver: sys.argv[1] selects the operation, sys.argv[2] names the input file.
content = load(sys.argv[2])
opt = sys.argv[1]
if opt == "-k":
    # -k: extract keywords and print one per line (UTF-8).
    rs = keywords(content)
    for k in rs:
        print k.encode("utf-8")
elif opt == "-s":
    # -s: split into sentences and print one per line (UTF-8).
    ss = sentences(content)
    for s in ss:
        print s.encode("utf-8")
# elif opt == '-q':
#     ss = unmarked_sentences(content)
#     for s in ss:
#         print s.encode('utf-8')
elif opt.startswith("-r"):
    # -r...: parseopt(opt) yields (n, k); print k randomly sampled n-grams.
    n, k = parseopt(opt)
    # print "n=%d, k=%d" % (n, k)
    stops = loadstops("stops_ru.txt")
    ng = ngrams(n, content, stops)
    samples = random.sample(ng, k)
    for s in samples:
        print s.encode("utf-8")
    # for k in ng[:5]:
    #     print k.encode('utf-8')
elif opt == "-n":
    # -n: print the normalized form of the whole text.
    print normalize(content)
else:
    print "Unrecognized option: %s " % opt
    usage(sys.argv[0])