Example #1
def test_part_group():
    from backend import article as a
    from backend import tokenizer as tk
    from backend import util

    article = a.PubMedArticle()
    article.setTitle("i don't want to do any ... !dog  pen!")
    article.add_abstract_text("a pen-pen  pen is a #dog")
    article.add_abstract_text("i have a \n pen")
    article.tokenize(tk.SpaceTokenizer())
    tokens = ["i", "have", "a", "dog", "pen"]
    title_group, abstract_group = util.find_token_pos_in_pubmed_article(
        tokens, article)

    print(title_group)
    print(abstract_group)
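The test only touches a small part of the article interface. A hypothetical stand-in with the same surface, inferred from the calls above (the real backend.article.PubMedArticle may store more state), could look like:

class PubMedArticle:
    """Hypothetical stand-in inferred from the calls in the test above."""
    def __init__(self):
        self.title = ""
        self.abstract_texts = []

    def setTitle(self, title):
        self.title = title

    def add_abstract_text(self, text):
        self.abstract_texts.append(text)

    def tokenize(self, tokenizer):
        # tokenize the title and each abstract paragraph with the given tokenizer
        self.title_tokens = tokenizer.tokenize(self.title)
        self.abstract_tokens = [tokenizer.tokenize(t) for t in self.abstract_texts]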
Example #2
def main_test():
    loader = tl.TextLoader()
    print('main test')
    # load every article under the corpus directory
    corpus = loader.load_corpus_from_directory('foo',
                                               './backend/data/pubmed/gene')
    # parse the raw articles, tokenize them, and collect the vocabulary
    factory = parse.ParserFactory()
    corpus.parseAll(factory)
    #articles = corpus.articles
    tokenizer = tk.SpaceTokenizer()
    corpus.tokenizeAll(tokenizer)
    corpus.build_vocab()
    #test_near_corpus_token("the",corpus)
    #test_near_corpus_token("hte",corpus)
    #test_near_corpus_token("genetic",corpus)
    #test_near_corpus_token("ganetic",corpus)
    #test_near_corpus_token("gan",corpus)
    #util.zip_dist_corpus(corpus,'test')
    indexer = idx.Indexer(corpus)
    #test_query_sentence(tokenizer,indexer)
    #queryer = q.Queryer(indexer)
    #print(queryer.get_spellchecked_tokens("gane is pretein",tokenizer,error_rate=0.6))
    test_query_sentence(tokenizer, indexer)
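main_test wires the whole pipeline together: load the corpus from a directory, parse and tokenize it, build the vocabulary, then index and query it. The vocabulary step is conceptually just the set of tokens seen across all documents; a minimal illustration of that idea (not the backend's actual build_vocab implementation):

def build_vocab(tokenized_docs):
    # union of all tokens across the tokenized documents
    vocab = set()
    for tokens in tokenized_docs:
        vocab.update(tokens)
    return vocab

print(build_vocab([["a", "gene", "pen"], ["a", "dog"]]))
# {'a', 'gene', 'pen', 'dog'} (set order may vary)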
Example #3
def analyze(self, tokenizer=tk.SpaceTokenizer()):
    self.zipf = util.Zipf()
    self.zipf.add_tokens(tokenizer.tokenize(self.text))
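analyze only relies on two small interfaces: tokenize(text) returning a token list, and Zipf.add_tokens(tokens). Hypothetical stand-ins inferred from that usage (the real backend.tokenizer.SpaceTokenizer and backend.util.Zipf may do more):

from collections import Counter

class SpaceTokenizer:
    """Hypothetical stand-in: split text on whitespace."""
    def tokenize(self, text):
        return text.split()

class Zipf:
    """Hypothetical stand-in: accumulate token frequencies."""
    def __init__(self):
        self.counts = Counter()

    def add_tokens(self, tokens):
        self.counts.update(tokens)

zipf = Zipf()
zipf.add_tokens(SpaceTokenizer().tokenize("a pen is a pen"))
print(zipf.counts.most_common(2))  # [('a', 2), ('pen', 2)]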
Example #4
import argparse
import os
import re
import sys

print(sys.path[0])
from backend import util
from backend import tokenizer as tk

parser = argparse.ArgumentParser()
parser.add_argument("corpus", help="corpus name, e.g. pubmed/gene")
args = parser.parse_args()

# split a corpus name such as 'pubmed/gene' (or 'pubmed\gene') into its parts
def split_corpus_name(_path):
    return re.split(r"[\\/]", _path)

corpus_name = args.corpus
corpus_path = os.path.join('./data', corpus_name)
corpus, indexer = util.build_corpus_and_indexer(corpus_name, corpus_path, tk.SpaceTokenizer())

save_path_root = '../main/static'
figure_dir = os.path.join(save_path_root, corpus_name)
save_path = os.path.join(figure_dir, '%s.png' % split_corpus_name(corpus_name)[1])
if not os.path.isdir(figure_dir):
    os.makedirs(figure_dir)
util.save_dist_figure(corpus, corpus_name, save_path)
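For the example corpus name mentioned above (pubmed/gene), the path construction works out as follows; a hypothetical walk-through using the same splitting helper:

import os
import re

def split_corpus_name(_path):
    return re.split(r"[\\/]", _path)

# hypothetical walk-through for corpus_name = 'pubmed/gene'
corpus_name = 'pubmed/gene'
save_path_root = '../main/static'
figure_dir = os.path.join(save_path_root, corpus_name)
save_path = os.path.join(figure_dir, '%s.png' % split_corpus_name(corpus_name)[1])
print(figure_dir)  # ../main/static/pubmed/gene
print(save_path)   # ../main/static/pubmed/gene/gene.png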

Example #5
def query():
    item_per_page = 10
    query = request.form['query']
    rank_model = request.form['rank_model']
    tf_option = request.form['tf_option']
    idf_option = request.form['idf_option']
    print('rank model')
    print(rank_model)
    print(tf_option)
    print(idf_option)

    if 'page_idx' in request.form:
        page_idx = int(request.form['page_idx'])
    else:
        page_idx = 1

    try:
        k_num = int(request.form['top_k_num'])
    except (KeyError, ValueError):
        k_num = 10

    token_algorithm = request.form['token_algorithm']

    ir_sys_for_query = None
    if token_algorithm == 'normal':
        ir_sys_for_query = ir_sys
    elif token_algorithm == 'porter':
        ir_sys_for_query = ir_sys_porter
    else:
        assert False

    if rank_model == 'match':
        titles_by_order, abstracts_by_order, corpus_names, match_total, token_matches, tokens = \
            ir_sys_for_query.make_query_order_by_match_total(query, k_num)
    elif rank_model == 'tfidf':
        titles_by_order, abstracts_by_order, corpus_names, match_total, token_matches, tokens = \
            ir_sys_for_query.make_query_order_by_tfidf(query, k_num, which_tf=tf_option, which_idf=idf_option)
    else:
        assert False

    total_item_num = len(titles_by_order)

    tokenizer = tk.SpaceTokenizer()
    alternative_query = ''

    # clamp the requested page into the valid range
    start_idx = (page_idx - 1) * item_per_page
    if start_idx >= total_item_num:
        page_idx = (total_item_num - 1) // item_per_page + 1
        start_idx = (page_idx - 1) * item_per_page
    if start_idx <= 0:
        page_idx = 1
        start_idx = 0
    # take at most item_per_page results for this page
    end_idx = min(start_idx + item_per_page, total_item_num)

    titles_by_order = titles_by_order[start_idx:end_idx]

    if not ir_sys.all_in_vocab_set(tokenizer.tokenize(query)):
        alternative_query_tokens =  ir_sys.alternative_query(query,tokenizer)
        alternative_query = " ".join(alternative_query_tokens)
    #pagination
   
    return render_template("query_list.html",
                           query=query,
                           page_idx=page_idx,
                           last_page=(total_item_num - 1) // item_per_page + 1,
                           top_k_num=k_num,
                           token_algorithm=token_algorithm,
                           alternaive_query=alternative_query,
                           titles_by_order=titles_by_order,
                           abstracts_by_order=abstracts_by_order,
                           corpus_names=corpus_names,
                           match_total=match_total,
                           token_matches=token_matches,
                           tokens=tokens,
                           list_result_flag=True)
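The page-clamping arithmetic in query() can be checked in isolation; a small hypothetical helper with the same math (not part of the backend):

def clamp_page(page_idx, total_item_num, item_per_page=10):
    """Return (page_idx, start_idx, end_idx) clamped to the valid range."""
    last_page = (total_item_num - 1) // item_per_page + 1
    page_idx = max(1, min(page_idx, last_page))
    start_idx = (page_idx - 1) * item_per_page
    end_idx = min(start_idx + item_per_page, total_item_num)
    return page_idx, start_idx, end_idx

print(clamp_page(3, 25))  # (3, 20, 25): the last page holds items 20..24
print(clamp_page(9, 25))  # page 9 clamped to the last page -> (3, 20, 25)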
Example #6
import _pickle as pickle
from backend.ir import IRSystem
from backend import tokenizer as tk
from backend import util


ir_sys1 = IRSystem(tk.SpaceTokenizer())
ir_sys2 = IRSystem(tk.PorterTokenizer())

def dump_ir_sys(ir_sys, directory_layer2):
    # serialize the IR system, its corpus names, and the corpora themselves
    print('dump1')
    with open("./temp/%s/ir_sys.pkl" % directory_layer2, mode='wb') as f:
        pickle.dump(ir_sys, f)

    print('dump2')
    with open("./temp/%s/corpus_names.pkl" % directory_layer2, mode='wb') as f:
        pickle.dump(ir_sys.corpus_names, f)

    with open("./temp/%s/corpus.pkl" % directory_layer2, mode='wb') as f:
        corpus_list = [q.indexer.corpus for _, q in ir_sys.queryers.items()]
        pickle.dump(corpus_list, f)


dump_ir_sys(ir_sys1, 'normal')
dump_ir_sys(ir_sys2, 'porter')

#with open('./temp/normal/pubmed/gene/indexer.pkl', mode='rb') as f:
#  indexer = pickle.load(f)
#  for path,articles in indexer.corpus.articles.items():
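The commented-out block hints at the reverse direction; a minimal sketch of reloading the dumps written above, assuming the same ./temp layout and that the backend classes are importable when unpickling:

import _pickle as pickle

def load_ir_sys(directory_layer2):
    # mirror the paths used by dump_ir_sys
    with open("./temp/%s/ir_sys.pkl" % directory_layer2, mode='rb') as f:
        ir_sys = pickle.load(f)
    with open("./temp/%s/corpus_names.pkl" % directory_layer2, mode='rb') as f:
        corpus_names = pickle.load(f)
    return ir_sys, corpus_names

ir_sys, corpus_names = load_ir_sys('normal')
print(corpus_names)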