Example #1
    def __init__(self,
                 text,
                 common_words=COMMON_WORDS,
                 whitelist_chars=WHITELIST_CHARS):
        # Keep the raw text and a preprocessed copy for later use
        self.__text = text
        # self.__preprocessed = utils.lemmatize(text)
        self.__preprocessed = preprocess_string(text)
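COMMON_WORDS and WHITELIST_CHARS are module-level constants that this excerpt does not show; purely as an illustration of what they might contain (both values below are assumptions, not taken from the original code):

COMMON_WORDS = {"the", "a", "an", "and", "or", "of", "to", "in"}
WHITELIST_CHARS = set("abcdefghijklmnopqrstuvwxyz0123456789 ")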
Example #2
import faiss


def vector_search(q, model, index, k=10):
    processed_q = preprocess_string(q)
    print("Query preprocessed")
    # Dense models (e.g. sentence-transformers) expose encode();
    # otherwise assume a fitted sparse vectorizer with transform().
    if hasattr(model, 'encode'):
        vector = model.encode([processed_q])
    else:
        vector = model.transform([processed_q]).toarray()
    vector = vector.astype("float32")
    faiss.normalize_L2(vector)  # normalize so inner product == cosine similarity
    D, I = index.search(vector, k=k)
    return D, I
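A minimal usage sketch for vector_search, assuming faiss and scikit-learn are installed, that preprocess_string returns a cleaned query string (stubbed out below), and an illustrative three-document corpus; apart from vector_search itself, none of these names come from the original code:

import faiss
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_string(s):
    # stand-in for the project's preprocessor
    return s.lower().strip()

corpus = ["diabetes symptoms and treatment",
          "covid vaccine side effects",
          "asthma inhaler usage"]
model = TfidfVectorizer()
doc_vectors = model.fit_transform(corpus).toarray().astype("float32")
faiss.normalize_L2(doc_vectors)                 # cosine similarity via inner product
index = faiss.IndexFlatIP(doc_vectors.shape[1])
index.add(doc_vectors)

D, I = vector_search("covid vaccine", model, index, k=2)
print(I[0])                                     # indices of the two closest documents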
Example #3
def pull_and_preprocess_from_mongo(start_index, num_docs):
    # Pull a batch of documents from MongoDB and clean their text,
    # returning (clean_text, _id) pairs.
    docs = collection.aggregate(mongo_query(start_index, num_docs))
    doc_list = []
    id_list = []
    for doc in docs:
        clean_text = preprocess_string(doc['text'] or "",
                                       stopping=True,
                                       stemming=True,
                                       lowercasing=True)
        doc_list.append(clean_text)
        id_list.append(doc['_id'])

    return list(zip(doc_list, id_list))
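The mongo_query helper used above is not part of this excerpt; a hypothetical sketch of it, assuming it is just a skip/limit pagination pipeline over the collection sorted by _id (all field names below are guesses):

def mongo_query(start_index, num_docs):
    # Hypothetical pagination pipeline passed to collection.aggregate()
    return [
        {"$sort": {"_id": 1}},
        {"$skip": start_index},
        {"$limit": num_docs},
        {"$project": {"_id": 1, "text": 1}},
    ]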
Example #4
def extract_pages(ls_pageid, gs_call):
    """ Extract nodes read only documents from input list of gid,
    preprocess text, remove punctuation, remove stopwords, tokenize.
    """
    from tempfile import gettempdir
    tmp_dir = gettempdir()
    output = open(tmp_dir + '/test.txt', 'w')
    logging.info("extract pages")
    for page_id in ls_pageid:
        logging.info(page_id)
        try:
            text = gs_call.get_nodes_binary_data([page_id])
        except DecodeError:
            continue
        page = text.entries[0].data.data.decode('utf-8')
        text = preprocess_string(page)
        # ylog.debug(text)
        output.write(text + '\n')
    output.close()
Example #5
    def qa(self, q: str) -> QAResult:
        # Find the most relevant documents and keep the top 2 as references
        documents = main_search(
            q, sources=["nhs_az", "nhs_med", "nhs_covid19", "bnf"])
        reference = [
            doc["description"] + " " +
            ensure_good_content(doc['content']['text'])[0][:300]
            for doc in documents[0:2]
        ]

        # Preprocess the question before QA (keep case, stems and stopwords)
        qa_clean_q = preprocess_string(q,
                                       lowercasing=False,
                                       stemming=False,
                                       stopping=False)
        print(f"Raw references in QA: {reference}")
        clean_refs = [preprocess_QA_text(ref) for ref in reference]
        print(f"Cleaned references to search in: {clean_refs}")
        answer = qa_model.predict(" ".join(clean_refs), qa_clean_q)

        return QAResult(answer=answer['answer'],
                        confidence=answer['confidence'])
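QAResult is not defined in this excerpt; a minimal sketch of what it could be, assuming it only carries the extracted answer span and the model's confidence score:

from dataclasses import dataclass

@dataclass
class QAResult:
    answer: str
    confidence: float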
Example #6
        for s in snipets[:nsnipets]:
            snipet_aug += f"{s:<60}"
        if len(tokens) > 0:
            doc = doc.replace(f"{configs.DATA_PATH}/", "")
            found_documents.append({
                "frequency": frequency,
                "doc": doc,
                "snipet_aug": snipet_aug
            })
            # found_documents.append(f"{frequency:<15}{doc:<50}{snipet_aug}")
    end_time = time.time()

    print()

    print(f"Found {n_found_docs} documents in: {end_time - start_time}")
    header = f"{'Frequency':<15}{'Document name':<50}"
    for i in range(nsnipets):
        snip_name = f"Snipet {i}"
        header += f"{snip_name:<60}"
    print(header)
    print("-" * (15 + 60 * nsnipets + 50))

    found_documents.sort(key=lambda x: x["frequency"], reverse=True)
    for i in found_documents:
        print(f"{i['frequency']:<15}{i['doc']:<50}{i['snipet_aug']}")


query = sys.argv[1]
query = preprocessing.preprocess_string(query)

run_basic(query)
Example #7
en_model_file_path = '/home/weiwu/tools/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
ch_model_file_path = '/home/weiwu/share/software/chinese.misc.distsim.crf.ser.gz'
jar_path = '/home/weiwu/tools/stanford-ner-2017-06-09/stanford-ner-3.8.0.jar'
en_tagger = StanfordNERTagger(
    model_filename=en_model_file_path, path_to_jar=jar_path)
print(
    en_tagger.tag(
        'Rami Eid is studying at Stony Brook University in NY'.split()))
# [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]

# StanfordNERTagger: Chinese named entity recognition
begin = default_timer()
str_test = u'''云南铜业股份有限公司(深交所:000878),简称云铜股份、云铜,前身为云南冶炼厂,成立于1958年,1998年改制为股份公司,更名为现称,1998年6月2日于深圳证券交易所上市。公司是中国第四大铜业企业,生产高纯阴极铜、电工用铜线坏、工业硫酸、金锭、银锭、电工用圆铜线、硫酸铜等主产品,并能综合回收金、银、铝、铋、铂、钯等多种有色金属。2007年10月,中国铝业收购云铜母公司云南铜业集团的49%股权,改名“中铝云南铜业集团”。'''
filter_setting = [tokenize, strip_punctuation]
text = preprocess_string(str_test, filter_setting)
ch_tagger = StanfordNERTagger(
    model_filename=ch_model_file_path, path_to_jar=jar_path)
end = default_timer()
result = ch_tagger.tag(text.split())
for word, tag in result:
    print(word, tag)
load_duration = end - begin
print("Total processing time: %.1f seconds" % load_duration)

# StanfordPOSTagger: Chinese part-of-speech tagging
pos_tagger_model_file_path = '/home/weiwu/share/software/chinese-distsim.tagger'
pos_tagger_jar_file_path = '/home/weiwu/tools/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar'

ch_pos_tagger = StanfordPOSTagger(
    model_filename=pos_tagger_model_file_path,
    path_to_jar=pos_tagger_jar_file_path)