def __init__(self, text, common_words=COMMON_WORDS, whitelist_chars=WHITELIST_CHARS):
    self.__text = text
    # self.__preprocessed = utils.lemmatize(text)
    self.__preprocessed = preprocess_string(text)
def vector_search(q, model, index, k=10):
    processed_q = preprocess_string(q)
    print("Query preprocessed")
    # Dense models (e.g. sentence-transformers) expose .encode; otherwise
    # assume a fitted sparse vectorizer with .transform (e.g. TF-IDF).
    if hasattr(model, 'encode'):
        vector = model.encode([processed_q])
    else:
        vector = model.transform([processed_q]).toarray()
    vector = vector.astype("float32")
    faiss.normalize_L2(vector)
    D, I = index.search(vector, k=k)
    return D, I
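# Hedged usage sketch for vector_search above, not part of the original code:
# it assumes the project's preprocess_string returns a cleaned string, uses a
# TF-IDF model (no .encode attribute, so the .transform branch is taken) and a
# FAISS inner-product index; the corpus below is made up for illustration.
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["aspirin dosage and side effects", "symptoms of the common cold"]
tfidf = TfidfVectorizer()
doc_vectors = tfidf.fit_transform(corpus).toarray().astype("float32")
faiss.normalize_L2(doc_vectors)  # cosine similarity via normalised inner product
ip_index = faiss.IndexFlatIP(doc_vectors.shape[1])
ip_index.add(doc_vectors)

D, I = vector_search("cold symptoms", tfidf, ip_index, k=2)
print(I[0])  # row indices of the best-matching corpus documents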
def pull_and_preprocess_from_mongo(start_index, num_docs):
    # Fetch a batch of documents from MongoDB, clean each text field,
    # and return (clean_text, _id) pairs.
    docs = collection.aggregate(mongo_query(start_index, num_docs))
    doc_list = []
    id_list = []
    for doc in docs:
        clean_text = preprocess_string(doc['text'] or "",
                                       stopping=True,
                                       stemming=True,
                                       lowercasing=True)
        doc_list.append(clean_text)
        id_list.append(doc['_id'])
    return list(zip(doc_list, id_list))
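# Hypothetical sketch of the mongo_query helper used above (not shown in the
# snippet): an aggregation pipeline that pages through the collection with
# $skip/$limit and keeps only the fields the loop reads ('_id' and 'text').
def mongo_query(start_index, num_docs):
    return [
        {"$skip": start_index},
        {"$limit": num_docs},
        {"$project": {"_id": 1, "text": 1}},
    ]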
def extract_pages(ls_pageid, gs_call):
    """Extract read-only node documents for the input list of page ids,
    then preprocess the text: remove punctuation, remove stopwords, tokenize.
    """
    from tempfile import gettempdir
    tmp_dir = gettempdir()
    output = open(tmp_dir + '/test.txt', 'w')
    logging.info("extract pages")
    for page_id in ls_pageid:
        logging.info(page_id)
        try:
            text = gs_call.get_nodes_binary_data([page_id])
        except DecodeError:
            continue
        page = text.entries[0].data.data.decode('utf-8')
        text = preprocess_string(page)
        # ylog.debug(text)
        output.write(text + '\n')
    output.close()
def qa(self, q: str) -> QAResult:
    # Find the top 2 relevant documents.
    documents = main_search(
        q, sources=["nhs_az", "nhs_med", "nhs_covid19", "bnf"])
    # Build references from each document's description plus the first
    # 300 characters of its cleaned content.
    reference = [
        doc["description"] + " " +
        ensure_good_content(doc['content']['text'])[0][:300]
        for doc in documents[0:2]
    ]
    # Lightly preprocess the query before searching the references.
    qa_clean_q = preprocess_string(q,
                                   lowercasing=False,
                                   stemming=False,
                                   stopping=False)
    print(f"Raw references in QA: {reference}")
    clean_refs = [preprocess_QA_text(ref) for ref in reference]
    print(f"Cleaned references to search in: {clean_refs}")
    answer = qa_model.predict(" ".join(clean_refs), qa_clean_q)
    return QAResult(answer=answer['answer'], confidence=answer['confidence'])
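# Hypothetical shape of the QAResult container returned by qa() above; the
# real project may define it differently (e.g. a pydantic model or NamedTuple).
from dataclasses import dataclass

@dataclass
class QAResult:
    answer: str
    confidence: float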
for s in snipets[:nsnipets]:
    snipet_aug += f"{s:<60}"
if len(tokens) > 0:
    doc = doc.replace(f"{configs.DATA_PATH}/", "")
    found_documents.append({
        "frequency": frequency,
        "doc": doc,
        "snipet_aug": snipet_aug
    })
    # found_documents.append(f"{frequency:<15}{doc:<50}{snipet_aug}")

end_time = time.time()
print()
print(f"Found {n_found_docs} documents in: {end_time - start_time}")

header = f"{'Frequency':<15}{'Document name':<50}"
for i in range(nsnipets):
    snip_name = f"Snipet {i}"
    header += f"{snip_name:<60}"
print(header)
print("-" * (15 + 60 * nsnipets + 50))

found_documents.sort(key=lambda x: x["frequency"], reverse=True)
for i in found_documents:
    print(f"{i['frequency']:<15}{i['doc']:<50}{i['snipet_aug']}")

querry = sys.argv[1]
querry = preprocessing.preprocess_string(querry)
run_basic(querry)
en_model_file_path = '/home/weiwu/tools/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
ch_model_file_path = '/home/weiwu/share/software/chinese.misc.distsim.crf.ser.gz'
jar_path = '/home/weiwu/tools/stanford-ner-2017-06-09/stanford-ner-3.8.0.jar'

en_tagger = StanfordNERTagger(
    model_filename=en_model_file_path, path_to_jar=jar_path)
print(
    en_tagger.tag(
        'Rami Eid is studying at Stony Brook University in NY'.split()))
# [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]

# StanfordNERTagger Chinese named entity recognition
begin = default_timer()
str_test = u'''云南铜业股份有限公司(深交所:000878),简称云铜股份、云铜,前身为云南冶炼厂,成立于1958年,1998年改制为股份公司,更名为现称,1998年6月2日于深圳证券交易所上市。公司是中国第四大铜业企业,生产高纯阴极铜、电工用铜线坏、工业硫酸、金锭、银锭、电工用圆铜线、硫酸铜等主产品,并能综合回收金、银、铝、铋、铂、钯等多种有色金属。2007年10月,中国铝业收购云铜母公司云南铜业集团的49%股权,改名“中铝云南铜业集团”。'''
filter_setting = [tokenize, strip_punctuation]
text = preprocess_string(str_test, filter_setting)
ch_tagger = StanfordNERTagger(
    model_filename=ch_model_file_path, path_to_jar=jar_path)
end = default_timer()
result = ch_tagger.tag(text.split())
for word, tag in result:
    print(word, tag)
load_duration = end - begin
print("Total processing time: %.1f seconds" % (end - begin))

# StanfordPOSTagger Chinese part-of-speech tagging
pos_tagger_model_file_path = '/home/weiwu/share/software/chinese-distsim.tagger'
pos_tagger_jar_file_path = '/home/weiwu/tools/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar'
ch_pos_tagger = StanfordPOSTagger(
    model_filename=pos_tagger_model_file_path,
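# Hedged usage sketch (kept commented out because the StanfordPOSTagger call
# above is truncated in the snippet): assuming the construction is completed
# with path_to_jar=pos_tagger_jar_file_path, tagging works on pre-segmented,
# whitespace-separated Chinese tokens, mirroring the NER call above.
# pos_result = ch_pos_tagger.tag(text.split())
# for word, tag in pos_result:
#     print(word, tag)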