import orjson
import requests
from flask import current_app, jsonify, request
from refextract import extract_references_from_url

# Helpers such as create_journal_dict, dedupe_list, map_refextract_to_schema
# and match_references are project-internal and assumed to be in scope.


def refextract_url():
    """Run refextract on a URL."""
    if current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
        # Delegate extraction to the standalone refextract service.
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        data = {
            "journal_kb_data": create_journal_dict(),
            "url": request.json["url"],
        }
        response = requests.post(
            f"{current_app.config['REFEXTRACT_SERVICE_URL']}/extract_references_from_url",
            headers=headers,
            data=orjson.dumps(data),
        )
        if response.status_code != 200:
            # The status code belongs in the response tuple, not inside jsonify().
            return jsonify({"message": "Cannot extract references"}), 500
        extracted_references = response.json()["extracted_references"]
    else:
        # Fall back to running refextract in-process.
        extracted_references = extract_references_from_url(
            request.json["url"],
            override_kbs_files={"journals": create_journal_dict()},
            reference_format="{title},{volume},{page}",
        )
    deduplicated_extracted_references = dedupe_list(extracted_references)
    references = map_refextract_to_schema(deduplicated_extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
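The dedupe_list helper above is project-internal and not shown in the source. A minimal sketch of an order-preserving deduplicator for lists of (unhashable) reference dicts, assuming orjson is available, might look like this:

# Hypothetical sketch of a dedupe_list helper: keep the first occurrence
# of each reference dict, preserving order.
def dedupe_list(items):
    seen = set()
    deduped = []
    for item in items:
        # Dicts are unhashable, so key on a canonical JSON serialization.
        key = orjson.dumps(item, option=orjson.OPT_SORT_KEYS)
        if key not in seen:
            seen.add(key)
            deduped.append(item)
    return deduped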
def refextract_url():
    """Run refextract on a URL."""
    extracted_references = extract_references_from_url(
        request.json['url'],
        override_kbs_files=get_refextract_kbs_path(),
        reference_format='{title},{volume},{page}',
    )
    references = map_refextract_to_schema(extracted_references)
    return jsonify(references)
def refextract_url():
    """Run refextract on a URL."""
    extracted_references = extract_references_from_url(
        request.json["url"],
        override_kbs_files={"journals": create_journal_dict()},
        reference_format="{title},{volume},{page}",
    )
    references = map_refextract_to_schema(extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
def refextract_url():
    """Run refextract on a URL."""
    with local_refextract_kbs_path() as kbs_path:
        extracted_references = extract_references_from_url(
            request.json['url'],
            override_kbs_files=kbs_path,
            reference_format='{title},{volume},{page}',
        )
    references = map_refextract_to_schema(extracted_references)
    return jsonify(references)
def refextract_url():
    """Run refextract on a URL."""
    with local_refextract_kbs_path() as kbs_path:
        extracted_references = extract_references_from_url(
            request.json["url"],
            override_kbs_files=kbs_path,
            reference_format="{title},{volume},{page}",
        )
    references = map_refextract_to_schema(extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
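None of these view snippets show how the endpoint is routed. A minimal, hypothetical way to mount and exercise refextract_url (the blueprint name and URL rule below are illustrative assumptions, not taken from the source) is:

from flask import Blueprint, Flask

# Hypothetical wiring: blueprint name and URL rule are assumptions.
blueprint = Blueprint("refextract", __name__)
blueprint.add_url_rule("/refextract/url", view_func=refextract_url, methods=["POST"])

app = Flask(__name__)
app.register_blueprint(blueprint)

with app.test_client() as client:
    response = client.post(
        "/refextract/url",
        json={"url": "https://arxiv.org/pdf/1503.07589.pdf"},
    )
    print(response.get_json())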
def explore_references(url, author, title):
    # Recursive reference exploration.
    references = extract_references_from_url(url)

    # Drop references that have no 'author' field (invalid for our purposes).
    # This replaces the original reversed-iteration deletion loop with an
    # equivalent, less error-prone filter.
    references = [reference for reference in references if 'author' in reference]

    for reference in references:
        true_author = parse(reference['author'][0])  # parse the author
        next_title = get_title(reference)            # title of the reference
        add_node(true_author, next_title, False)     # node for this reference's author
        # Edge from the current author's node to this reference's node.
        # nodes is a module-level list of {"Label": ...} dicts.
        add_edge(
            nodes.index(next(node for node in nodes if node["Label"] == author)),
            nodes.index(next(node for node in nodes if node["Label"] == true_author)),
        )

    # Ask whether the program should continue down the reference tree.
    cont = get_yes_no("Explore all valid references?")
    if not cont:
        return  # done exploring this node

    for reference in references:
        true_author = parse(reference['author'][0])
        next_title = get_title(reference)
        queries = get_queries(reference)  # search queries for this reference
        new_url = find_pdf(queries)       # google search and scrape for a PDF URL
        if new_url != "FAILURE":
            explore_references(new_url, true_author, next_title)  # explore the new PDF
        else:
            print("NO PDF FOUND FOR THIS DOCUMENT, MOVING ON")
import json
import sys

import refextract


def main():
    if len(sys.argv) < 2:
        print('usage: extractrefs <pdf_path> [dst_path]')
        return
    pdf_path = sys.argv[1]
    assert pdf_path.endswith('.pdf')
    dst_path = sys.argv[2] if len(sys.argv) > 2 else pdf_path.replace('.pdf', '.json')
    if pdf_path.startswith('http://') or pdf_path.startswith('https://'):
        refs = refextract.extract_references_from_url(pdf_path)
    else:
        refs = refextract.extract_references_from_file(pdf_path)
    with open(dst_path, 'w') as f:
        json.dump(refs, f, indent=4)
    print('saved refs to %s' % dst_path)
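Per its usage string, the script takes a local .pdf path or an http(s) URL, e.g. extractrefs paper.pdf refs.json; when dst_path is omitted, the output lands next to the input with the .pdf extension swapped for .json.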
from refextract import extract_references_from_url

references = extract_references_from_url('https://arxiv.org/pdf/1503.07589.pdf')
print(references[0])
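As the snippets above assume, each element of the returned list is a dict of recognized fields whose values are lists of strings, which is why the first author is read as reference['author'][0].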
def refextract_url():
    """Run refextract on a URL."""
    extracted_references = extract_references_from_url(request.json['url'])
    references = map_refextract_to_schema(extracted_references)
    return jsonify(references)