def test_part_group():
    """Smoke-test find_token_pos_in_pubmed_article on a tiny hand-built article."""
    from backend import article as a

    pubmed_article = a.PubMedArticle()
    pubmed_article.setTitle("i don't want to do any ... !dog pen!")
    pubmed_article.add_abstract_text("a pen-pen pen is a #dog")
    pubmed_article.add_abstract_text("i have a \n pen")
    pubmed_article.tokenize(tk.SpaceTokenizer())

    query_tokens = ["i", "have", "a", "dog", "pen"]
    groups = util.find_token_pos_in_pubmed_article(query_tokens, pubmed_article)
    title_group, abstract_group = groups

    # Visual inspection only — no assertions in this ad-hoc test.
    print(title_group)
    print(abstract_group)
def main_test():
    """End-to-end smoke test: load a corpus, parse, tokenize, index, then query."""
    print('main test')
    text_loader = tl.TextLoader()
    corpus = text_loader.load_corpus_from_directory('foo', './backend/data/pubmed/gene')

    corpus.parseAll(parse.ParserFactory())

    space_tokenizer = tk.SpaceTokenizer()
    corpus.tokenizeAll(space_tokenizer)
    corpus.build_vocab()

    # Earlier diagnostics (near-token lookups, spell-check probes, zipf dump)
    # were disabled here; only the query round-trip remains.
    indexer = idx.Indexer(corpus)
    test_query_sentence(space_tokenizer, indexer)
def analyze(self, tokenizer=None):
    """Build a Zipf token-frequency distribution over ``self.text``.

    Args:
        tokenizer: object with a ``tokenize(str) -> list`` method; when
            omitted, a fresh ``tk.SpaceTokenizer`` is created per call.
    """
    # Bug fix: the original default ``tokenizer=tk.SpaceTokenizer()`` was
    # evaluated once at function-definition time, so every call without an
    # argument shared the same tokenizer instance. Use the None sentinel.
    if tokenizer is None:
        tokenizer = tk.SpaceTokenizer()
    self.zipf = util.Zipf()
    self.zipf.add_tokens(tokenizer.tokenize(self.text))
# CLI script: build a corpus + indexer for the named corpus and save its
# token-distribution figure under ../main/static/<corpus_name>/.
print(sys.path[0])
from backend import util
from backend import tokenizer as tk
import re

parser = argparse.ArgumentParser()
parser.add_argument("corpus", help="corpus name")  # example: pubmed/gene
args = parser.parse_args()

# Compiled once at module level — the original rebuilt the pattern inside a
# lambda on every invocation. Splits on either '/' or '\' separators.
_PATH_SEP_RE = re.compile(r"[\\/]")


def _split_path(path):
    """Return *path* split on forward or backward slashes."""
    return _PATH_SEP_RE.split(path)


corpus_name = args.corpus
corpus_path = os.path.join('./data', corpus_name)
corpus, indexer = util.build_corpus_and_indexer(corpus_name, corpus_path,
                                                tk.SpaceTokenizer())

save_path_root = '../main/static'
figure_dir = os.path.join(save_path_root, corpus_name)
# NOTE(review): assumes corpus_name contains a separator (e.g. 'pubmed/gene');
# _split_path(...)[1] raises IndexError for a bare name — confirm callers.
save_path = os.path.join(save_path_root, corpus_name,
                         '%s.png' % (_split_path(corpus_name)[1]))
# exist_ok=True replaces the isdir-then-makedirs check (race-free, idempotent).
os.makedirs(figure_dir, exist_ok=True)
util.save_dist_figure(corpus, corpus_name, save_path)
def query():
    """Flask view: run a search against the selected IR system and render a
    paginated result list.

    Form fields read: ``query``, ``rank_model`` ('match' | 'tfidf'),
    ``tf_option``, ``idf_option``, ``token_algorithm`` ('normal' | 'porter'),
    and optional ``page_idx`` / ``top_k_num``.
    """
    item_per_page = 10
    query = request.form['query']
    rank_model = request.form['rank_model']
    tf_option = request.form['tf_option']
    idf_option = request.form['idf_option']
    print('rank model')
    print(rank_model)
    print(tf_option)
    print(idf_option)
    if 'page_idx' in request.form:
        page_idx = int(request.form['page_idx'])
    else:
        page_idx = 1
    try:
        k_num = int(request.form['top_k_num'])
    except (KeyError, ValueError):
        # Bug fix: was a bare ``except:`` — only a missing or non-numeric
        # field should fall back to the default, not e.g. KeyboardInterrupt.
        k_num = 10
    token_algorithm = request.form['token_algorithm']
    ir_sys_for_query = None
    # Select the IR system matching the requested tokenization scheme.
    if token_algorithm == 'normal':
        ir_sys_for_query = ir_sys
    elif token_algorithm == 'porter':
        ir_sys_for_query = ir_sys_porter
    else:
        assert False
    if rank_model == 'match':
        titles_by_order, abstracts_by_order, corpus_names, match_total, token_matches, tokens = \
            ir_sys_for_query.make_query_order_by_match_total(query, k_num)
    elif rank_model == 'tfidf':
        titles_by_order, abstracts_by_order, corpus_names, match_total, token_matches, tokens = \
            ir_sys_for_query.make_query_order_by_tfidf(query, k_num, which_tf=tf_option, which_idf=idf_option)
    else:
        assert False
    total_item_num = len(titles_by_order)
    tokenizer = tk.SpaceTokenizer()
    alternative_query = ''
    # Clamp page_idx into the valid range, then compute the page's slice.
    start_idx = (page_idx - 1) * item_per_page
    if start_idx >= total_item_num:
        page_idx = (total_item_num - 1) // item_per_page + 1
        start_idx = (page_idx - 1) * item_per_page
    if start_idx <= 0:
        page_idx = 1
        start_idx = 0
    end_idx = start_idx + item_per_page
    if end_idx >= total_item_num:
        end_idx = total_item_num - 1
    # NOTE(review): only titles_by_order is sliced to the current page;
    # abstracts_by_order / corpus_names / token_matches are passed whole —
    # confirm the template indexes them by absolute position.
    titles_by_order = titles_by_order[start_idx:end_idx + 1]
    # Offer a spell-corrected alternative when the query has OOV tokens.
    if not ir_sys.all_in_vocab_set(tokenizer.tokenize(query)):
        alternative_query_tokens = ir_sys.alternative_query(query, tokenizer)
        alternative_query = " ".join(alternative_query_tokens)
    # NOTE(review): 'alternaive_query' is misspelled, but the template expects
    # this exact kwarg name — rename both together, not here alone.
    return render_template("query_list.html",
                           query=query,
                           page_idx=page_idx,
                           last_page=(total_item_num - 1) // item_per_page + 1,
                           top_k_num=k_num,
                           token_algorithm=token_algorithm,
                           alternaive_query=alternative_query,
                           titles_by_order=titles_by_order,
                           abstracts_by_order=abstracts_by_order,
                           corpus_names=corpus_names,
                           match_total=match_total,
                           token_matches=token_matches,
                           tokens=tokens,
                           list_result_flag=True)
import _pickle as pickle
from backend.ir import IRSystem
from backend import tokenizer as tk
from backend import util

# Build one IR system per tokenization scheme; both are pickled below.
ir_sys1 = IRSystem(tk.SpaceTokenizer())
ir_sys2 = IRSystem(tk.PorterTokenizer())


def dump_ir_sys(ir_sys, directory_layer2):
    """Pickle an IRSystem and its corpus metadata under ./temp/<dir>/.

    Writes three files: ir_sys.pkl (whole system), corpus_names.pkl, and
    corpus.pkl (one corpus per queryer).

    Args:
        ir_sys: the IRSystem instance to serialize.
        directory_layer2: sub-directory name under ./temp,
            e.g. 'normal' or 'porter'. (Parameter was previously misspelled
            'direcroty_layer2'; both call sites here are positional.)
    """
    import os
    out_dir = "./temp/%s" % (directory_layer2)
    # Robustness fix: the original open() calls failed with FileNotFoundError
    # when the target directory did not already exist.
    os.makedirs(out_dir, exist_ok=True)
    print('dump1')
    with open("%s/ir_sys.pkl" % (out_dir), mode='wb') as f:
        pickle.dump(ir_sys, f)
    print('dump2')
    with open("%s/corpus_names.pkl" % (out_dir), mode='wb') as f:
        pickle.dump(ir_sys.corpus_names, f)
    with open("%s/corpus.pkl" % (out_dir), mode='wb') as f:
        # Only the values are needed; the original iterated .items().
        corpus_list = [q.indexer.corpus for q in ir_sys.queryers.values()]
        pickle.dump(corpus_list, f)


dump_ir_sys(ir_sys1, 'normal')
dump_ir_sys(ir_sys2, 'porter')