def search(index: str, text: str):
    query = preprocess_text(text.strip().lower())
    vector = get_vector(text)["vector"]
    langs = get_config("LANGUAGES")
    search_fields = list()
    for lang in langs:
        search_fields += [f'title_{lang}', f'content_{lang}']
    query_json = {
        "_source": ["url", "authors", "citedby", "year", "lang"]
                   + [f'title_{l}' for l in langs],
        "query": {
            "script_score": {
                "query": {
                    "bool": {
                        "should": [{"match": {f: query}} for f in search_fields]
                    }
                },
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                    "params": {"query_vector": vector}
                }
            }
        },
        "highlight": {
            "fragment_size": 100,
            "fields": {f: {} for f in search_fields}
        },
        "size": 100
    }
    url = get_config("ELASTICSEARCH") + f'/{index}/_search'
    response = rq.get(url, json=query_json).json()
    logger.info(f'Resp: {response}')
    return response.get("hits", {}).get("hits", [])
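# Note: in a script_score query, the script result replaces the BM25 score of the
# documents matched by the inner bool/should query, so ranking here is driven by
# vector similarity alone; the "+ 1.0" keeps the score non-negative, which
# Elasticsearch requires. A minimal Python sketch of the same computation, for
# reference only (not part of the service code):
import math

def script_score_equivalent(query_vector, doc_vector):
    # cosineSimilarity(params.query_vector, 'vector') + 1.0, computed in plain Python.
    dot = sum(q * d for q, d in zip(query_vector, doc_vector))
    norm = (math.sqrt(sum(q * q for q in query_vector))
            * math.sqrt(sum(d * d for d in doc_vector)))
    return (dot / norm) + 1.0 if norm else 1.0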
def search_publication():
    pubs = search("publication", request.args["text"])
    langs = get_config("LANGUAGES")
    authors = AutoSortedDict(sort_field="score")
    author_count, pub_count = 0, 0
    for pub in pubs:
        pub_count += 1
        pub.pop("_index")
        pub_lang = pub["_source"]["lang"]
        pub_authors = pub["_source"].get("authors", list())
        for pub_author in pub_authors:
            author_id = pub_author.pop("id")
            if authors.get(author_id, None) is not None:
                # Author already collected: remove and re-insert so the entry is
                # re-sorted with its updated score.
                current_author = authors[author_id]
                del authors[author_id]
                current_author["pub_counts"] += 1
                current_author["score"] += pub["_score"]
                current_author["pubs"].append({
                    "id": pub["_id"],
                    "title": pub["_source"][f'title_{pub_lang}']
                })
                authors[author_id] = current_author
            elif author_count < 100:
                author_count += 1
                pub_author["score"] = pub["_score"]
                pub_author["pubs"] = [{
                    "id": pub["_id"],
                    "title": pub["_source"][f'title_{pub_lang}']
                }]
                pub_author["pub_counts"] = 1
                authors[author_id] = pub_author
        pub["_source"]["title"] = pub["_source"].get(f'title_{pub_lang}', "unknown")
        for k in langs:
            pub["_source"].pop(f'title_{k}', None)
            pub["_source"].pop(f'content_{k}', None)
    return jsonify({
        "authors": {"count": author_count, "items": authors.values()[:100]},
        "pubs": {"count": pub_count, "items": pubs}
    })
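# AutoSortedDict is used above but not defined in this section. A minimal sketch of
# the interface search_publication relies on is given below; the implementation is
# an assumption inferred from usage (dict-style access, plus values() returning the
# stored items ordered by sort_field, highest score first, and sliceable).
class AutoSortedDict(dict):
    def __init__(self, sort_field: str):
        super().__init__()
        self.sort_field = sort_field

    def values(self):
        # Return items as a list sorted by the configured field, descending.
        return sorted(
            super().values(),
            key=lambda item: item[self.sort_field],
            reverse=True,
        )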
def get_vector(text: str):
    url = get_config("VECTORIZER")
    url += "/vectorize"
    response = requests.get(
        url=url,
        json={"text": text}
    ).json()
    return response
def lang_detect(text):
    url = get_config("VECTORIZER")
    url += "/detect/lang"
    response = requests.get(
        url=url,
        json={"text": text}
    ).json()
    return response.get("lang", "unknown")
def get_docs(ids, projections=None):
    q = {"query": {"ids": {"values": list(ids)}}}
    if projections:
        q["_source"] = projections
    url = get_config("ELASTICSEARCH") + "/_search"
    response = rq.get(url, json=q).json()
    return response.get("hits", {}).get("hits", [])
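# Usage note for get_docs: the ids query goes to the cluster-wide /_search
# endpoint, so documents are fetched by _id across all indices. With hypothetical
# arguments, the request body would look like this (values are illustrative only):
#   get_docs(["3b1e0d", "9a4f2c"], ["title_en", "year"])
#   -> {"query": {"ids": {"values": ["3b1e0d", "9a4f2c"]}},
#       "_source": ["title_en", "year"]}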
def translate(text, dest_lang):
    url = get_config("VECTORIZER")
    url += "/translate"
    response = requests.get(
        url=url,
        json={"text": text, "dest_lang": dest_lang}
    ).json()
    return response.get("text", None)
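# get_vector, lang_detect and translate are thin clients of the same VECTORIZER
# service. Judging from how their responses are consumed in this section, the
# service is expected to answer roughly as follows (payloads are illustrative
# assumptions, not taken from the source):
#   GET {VECTORIZER}/vectorize    {"text": "..."}                     -> {"vector": [0.12, ...], "lang": "en"}
#   GET {VECTORIZER}/detect/lang  {"text": "..."}                     -> {"lang": "en"}
#   GET {VECTORIZER}/translate    {"text": "...", "dest_lang": "en"}  -> {"text": "..."}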
def t_find_pdf_primarily(self, pub_id: str, title: str, authors: list, url: str):
    resd = {"status": "ok"}
    if url:
        files_path = get_config("FILES_PATH")
        file_name = md5(url.encode("utf-8")).hexdigest()
        if not os.path.exists(files_path):
            os.makedirs(files_path)
        pdf_raw = download(url)
        full_path = f'{files_path}{os.path.sep}{file_name}.pdf'
        with open(full_path, "wb+") as f:
            f.write(pdf_raw)
        resd["path"] = full_path
        try:
            content = extract_text_from_pdf(full_path)
        except Exception as e:
            resd["extraction_failure"] = str(e)
            logger.debug(e)
            content = None
        update_result = update_one("publication", {
            "filter": {"id": {"$eq": pub_id}},
            "update": {
                "$set": {
                    "raw_base64": base64.encodebytes(pdf_raw).decode("utf-8"),
                    "content": content
                }
            },
            "upsert": True
        })
        logger.info(f'Update Result: {update_result}')
        t_elasticsearch_indexing.apply_async((pub_id,))
    else:
        authors = find("author", {
            "filter": {"id": {"$in": authors}},
            "projection": {"name": 1}
        })
        t_find_pdf_secondarily.apply_async(
            (pub_id, title, [a["name"] for a in authors])
        )
    return resd
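# download() is used by both PDF tasks but is not defined in this section. A
# minimal sketch consistent with that usage (fetch a URL, return the raw bytes)
# is given below; the timeout value and the use of plain requests are assumptions.
def download(url: str) -> bytes:
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return response.content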
def translate(text, dest_lang):
    url = get_config("VECTORIZER")
    url += "/translate"
    try:
        response = requests.get(
            url=url,
            json={"text": text, "dest_lang": dest_lang}
        ).json().get("text", None)
    except Exception:
        response = False
    return response
def update_vector(index, _id, vector, rcoef, relevance):
    logger.info(f'{type(relevance)}: {relevance}')
    sign = "+" if str(relevance).strip().lower() == "true" else "-"
    inline = (
        "for (int i=0; i<ctx._source.vector.length; ++i)"
        "{ctx._source.vector[i]=(ctx._source.vector[i]"
        + sign +
        "(params.vector[i]*params.rcoef))/2}"
    )
    q = {
        "script": {
            "lang": "painless",
            "params": {
                "vector": list(vector),
                "rcoef": rcoef
            },
            "inline": inline
        }
    }
    response = rq.post(
        get_config("ELASTICSEARCH") + f'/{index}/_update/{_id}',
        json=q
    ).json()
    logger.info(response)
    return response
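# The inline Painless script above is a simple relevance-feedback update: each
# component of the stored vector is averaged with the scaled query vector when the
# feedback is positive, and pushed away from it when negative. The same update in
# plain Python, for reference only (not part of the service code):
def feedback_update(doc_vector, query_vector, rcoef, relevance: bool):
    # new[i] = (doc[i] + query[i]*rcoef) / 2 if relevant, else (doc[i] - query[i]*rcoef) / 2
    sign = 1.0 if relevance else -1.0
    return [(d + sign * q * rcoef) / 2 for d, q in zip(doc_vector, query_vector)]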
import requests
import base64
import json
def t_elasticsearch_indexing(self, pub_id: str):
    resd = {"status": "ok"}
    pub = find_one("publication", {
        "filter": {"id": {"$eq": pub_id}, "vector": {"$exists": True}}
    })
    pub["authors"] = find("author", {
        "filter": {"id": {"$in": pub.get("authors", [])}},
        "projection": ["id", "name", "affiliation", "citedby",
                       "interests", "organizations"]
    })
    pub.pop("created_at", None)
    pub.pop("raw_base64", None)
    pub.pop("title_md5", None)
    pub.pop("_id", None)
    pub_id = pub.pop("id")
    # Build the text to vectorize: the content plus a usable (non-placeholder) title.
    vector_field_tokens = list()
    if pub.get("content", None):
        vector_field_tokens += pub["content"].split()
    if not pub["title"].startswith("unk_"):
        vector_field_tokens += pub["title"].split()
    vector_field = " ".join(vector_field_tokens)
    vectorizer_response = get_vector(preprocess_text(vector_field))
    pub["lang"] = vectorizer_response["lang"]
    pub["vector"] = vectorizer_response["vector"]
    langs = get_config("LANGUAGES")
    for lang in langs:
        if lang != pub["lang"]:
            pub[f'title_{lang}'] = preprocess_text(
                translate(pub["title"], lang) or ""
            )
            if str(pub.get("content", None)).strip().lower() not in ["none", ""]:
                pub[f'content_{lang}'] = preprocess_text(
                    translate(pub["content"], lang) or ""
                )
        else:
            pub[f'title_{lang}'] = preprocess_text(pub["title"])
            pub[f'content_{lang}'] = preprocess_text(pub.get("content", "") or "")
    if "title" in pub:
        del pub["title"]
    if "content" in pub:
        del pub["content"]
    update_one("publication", {
        "filter": {"id": {"$eq": pub_id}},
        "update": {"$set": {"vector": pub["vector"], "lang": pub["lang"]}}
    })
    for lang in langs:
        publication_mappings["properties"][f'title_{lang}'] = {"type": "text"}
        publication_mappings["properties"][f'content_{lang}'] = {"type": "text"}
    resp = rq.put(
        get_config("ELASTICSEARCH") + "/publication",
        json={"mappings": publication_mappings}
    )
    if resp.status_code == 400:
        # Index already exists: update its mappings instead of creating it.
        resp = rq.put(
            get_config("ELASTICSEARCH") + "/publication/_mappings",
            json=publication_mappings
        )
    logger.info(f'Mapping Response: {resp.json()}')
    # resp = es.indices.create(
    #     index="publication",
    #     body={"mappings": publication_mappings},
    #     ignore=400
    # )
    result = es.index(index="publication", body=pub, id=pub_id)
    resd["result"] = result
    return resd
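# publication_mappings is referenced above but not shown in this section. Based on
# how it is used (a standard Elasticsearch mapping whose "properties" are extended
# with per-language text fields, and which must declare the dense vector queried by
# cosineSimilarity in search()), a plausible shape is sketched below; the concrete
# fields and the vector dimensionality are assumptions, not taken from the source.
publication_mappings = {
    "properties": {
        "url": {"type": "keyword"},
        "year": {"type": "keyword"},
        "citedby": {"type": "integer"},
        "lang": {"type": "keyword"},
        "authors": {"type": "object"},
        "vector": {"type": "dense_vector", "dims": 512},
    }
}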
def t_find_pdf_secondarily(self, pub_id: str, title: str, authors: list):
    resd = {"status": "ok"}
    try:
        # Check each author one by one.
        for single_author in authors:
            # Query libgen for this author.
            http = urllib3.PoolManager()
            response = http.request(
                'GET', 'https://libgen.is/scimag/?q=' + single_author)
            html_text = response.data
            soup = BeautifulSoup(html_text, 'html.parser')
            # Check whether the search returned any results.
            try:
                total_value = str(
                    soup.find('div', attrs={
                        'style': 'float:left'
                    }).getText()).split(" ")[0]
            except Exception:
                total_value = 0
            # If the search returned nothing, move on to the next author.
            if total_value == 0:
                continue
            # Compute the number of result pages; because of the remainder in the
            # division, add one page if needed so the last page is not skipped.
            total_page_exact = int(total_value) / 25
            total_page = int(int(total_value) / 25)
            if total_page != total_page_exact:
                total_page += 1
            # Walk through the result pages of one author. The first page was
            # already requested above (we only get here when the result count is
            # non-zero), so it is not requested again. If nothing is found on the
            # current page and there is more than one page, a new request is made
            # at the end of the loop and the next iteration searches the new page.
            for i in range(total_page):
                counter = 0
                for row in soup.find_all('tr'):
                    if counter == 0:
                        # Skip the first row: it holds the table header of the page.
                        counter += 1
                        continue
                    row_item = row.find_all('td')
                    row_title = row_item[1].find_all('a')[0].text
                    # Check how similar the row title is to the requested title.
                    ratio = fuzz.ratio(row_title.lower(), title.lower())
                    if ratio > 75:
                        url_for_get = row_item[4].find_all('li')
                        href = url_for_get[1].find_all('a', href=True)[0]['href']
                        response_for_pdf = http.request('GET', href)
                        pdf_page = BeautifulSoup(response_for_pdf.data, 'html.parser')
                        pdf_url = pdf_page.find_all(
                            'td', {'align': 'center'})[0].find_all(
                                'a', href=True)[0]['href']
                        pdf_raw = download(pdf_url)
                        files_path = get_config("FILES_PATH")
                        if not os.path.exists(files_path):
                            os.makedirs(files_path)
                        file_name = md5(pdf_url.encode("utf-8")).hexdigest()
                        full_path = f'{files_path}{os.path.sep}{file_name}.pdf'
                        with open(full_path, "wb+") as f:
                            f.write(pdf_raw)
                        resd["path"] = full_path
                        try:
                            content = extract_text_from_pdf(full_path)
                        except Exception as e:
                            resd["extraction_failure"] = str(e)
                            logger.debug(e)
                            content = None
                        update_one("publication", {
                            "filter": {"id": {"$eq": pub_id}},
                            "update": {
                                "$set": {
                                    "raw_base64": base64.encodebytes(pdf_raw).decode("utf-8"),
                                    "content": content
                                }
                            },
                            "upsert": True
                        })
                        if content:
                            logger.info('Content added to publication.')
                        t_elasticsearch_indexing.apply_async((pub_id,))
                        return resd
                if total_page > 1:
                    response = http.request(
                        'GET',
                        'https://libgen.is/scimag/?q=' + single_author
                        + '&page=' + str(i + 2))
                    html_text = response.data
                    soup = BeautifulSoup(html_text, 'html.parser')
    except Exception as e:
        logger.exception(e)
    t_elasticsearch_indexing.apply_async((pub_id,))
    return resd