import operator
from itertools import groupby

import config
# get_citations, get_references and get_meta_data are assumed to be
# provided by the surrounding package.


def get_suggestions(**args):
    # initializations
    bibcodes = args.get("bibcodes", [])
    if len(bibcodes) == 0:
        return []
    # Any overrides for default values?
    Nsuggestions = args.get("Nsuggest", config.BIBUTILS_DEFAULT_SUGGESTIONS)
    output_format = args.get("fmt", config.BIBUTILS_DEFAULT_FORMAT)
    # Get rid of potential trailing spaces and cap the input size.
    # (List comprehensions replace the Python 2 map/filter calls, whose
    # iterator results cannot be sliced in Python 3.)
    bibcodes = [b.strip() for b in bibcodes][: config.BIBUTILS_MAX_INPUT]
    # Start processing:
    # get the citations for all publications (keeping multiplicity is essential)
    cit_dict = get_citations(bibcodes=bibcodes, threads=config.BIBUTILS_THREADS)
    cits = [item for sublist in cit_dict.values() for item in sublist]
    # clean up cits
    cits = [c for c in cits if len(c) > 0]
    # get references
    refs = get_references(bibcodes=bibcodes)
    # clean up refs
    refs = [r for r in refs if len(r) > 0]
    # remove the original papers from the results to get the candidates
    papers = [p for p in cits + refs if p not in bibcodes]
    # establish the frequency of each paper in the results
    paperFreq = [(k, len(list(g))) for k, g in groupby(sorted(papers))]
    # and sort them, most frequent first
    paperFreq = sorted(paperFreq, key=operator.itemgetter(1), reverse=True)
    # remove all papers with frequencies at or below the threshold
    paperFreq = [p for p in paperFreq if p[1] > config.BIBUTILS_THRESHOLD_FREQUENCY]
    # get metadata for the suggestions
    meta_dict = get_meta_data(results=paperFreq[:Nsuggestions])
    # return the results in the required format
    if output_format == "score":
        return [
            {"bibcode": x, "score": y, "title": meta_dict[x]["title"], "author": meta_dict[x]["author"]}
            for (x, y) in paperFreq[:Nsuggestions]
            if x in meta_dict
        ]
    else:
        return [
            {"bibcode": x, "score": "NA", "title": meta_dict[x]["title"], "author": meta_dict[x]["author"]}
            for (x, y) in paperFreq[:Nsuggestions]
            if x in meta_dict
        ]
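
# --- Usage sketch (illustrative assumption, not part of the module) ---
# A minimal example of calling get_suggestions() above, assuming the config
# values and the get_citations/get_references/get_meta_data helpers resolve.
# The bibcodes are just example ADS identifiers; note that trailing
# whitespace is stripped by the function itself.
if __name__ == "__main__":
    suggestions = get_suggestions(
        bibcodes=["1998AJ....116.1009R ", "1999ApJ...517..565P"],
        Nsuggest=5,     # override config.BIBUTILS_DEFAULT_SUGGESTIONS
        fmt="score",    # include the co-citation/co-reference frequencies
    )
    for s in suggestions:
        print(s["bibcode"], s["score"], s["title"])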
# config, app and the helpers used below (get_publications_from_query,
# get_bibcodes_from_private_library, chunks, get_publication_data,
# get_citations, get_mongo_data, make_vectors, sort_list_of_lists) are
# assumed to be imported elsewhere in this module.
def get_attributes(args):
    """Gather all data necessary for the metrics calculations"""
    solr_url = config.SOLR_URL
    max_hits = config.METRICS_MAX_HITS
    threads = config.METRICS_THREADS
    chunk_size = config.METRICS_CHUNK_SIZE
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        bibcodes = [b.strip() for b in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow retrieving bibcodes from private libraries;
        # clearly this is currently not used.
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        # No recognized input was supplied, so there is nothing to compute
        return [], 0, 0
    # Split the list of bibcodes into chunks, for parallel processing
    biblists = list(chunks(bibcodes, chunk_size))
    # Gather all publication information into one publication dictionary,
    # keyed on bibcode
    publication_data = get_publication_data(biblists=biblists)
    missing_bibcodes = [b for b in bibcodes if b not in publication_data]
    if missing_bibcodes:
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
    bibcodes = [b for b in bibcodes if b not in missing_bibcodes]
    # Get citation dictionaries (all, refereed and non-refereed citations in
    # separate dictionaries, so that we don't have to figure this out later)
    (cit_dict, ref_cit_dict, non_ref_cit_dict) = get_citations(
        bibcodes=bibcodes, pubdata=publication_data, type='metrics')
    # The dictionary values are lists of tuples whose first element is the
    # citing bibcode, so count the distinct citing papers.
    Nciting = len(set([x[0] for v in cit_dict.values() for x in v]))
    Nciting_ref = len(set([x[0] for v in ref_cit_dict.values() for x in v]))
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection.
    # This info gets stored in the dictionary 'ads_data', also keyed on bibcode.
    ads_data = get_mongo_data(bibcodes=bibcodes)
    # Generate the list of document attribute vectors and then
    # sort this list by citations (descending).
    # The attribute vectors will be used to calculate the metrics.
    attr_list = make_vectors(bibcodes, publication_data, ads_data,
                             cit_dict, ref_cit_dict, non_ref_cit_dict)
    # We sort the entries in the attribute list on citation count, which
    # makes e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)
    return attr_list, Nciting, Nciting_ref
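
# --- Helper sketch (assumption: chunks() is defined elsewhere) ---
# get_attributes() relies on a chunks() generator to split the bibcode list
# for parallel processing; a minimal implementation would look like this.
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]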
import os
import time

import utils
from config import *
from tqdm import tqdm

if __name__ == "__main__":
    vocab = utils.get_vocabulary()  # load the vocabulary present in the system
    citations = utils.get_citations()  # load citation counts per doc_id
    while True:
        print("Enter a word to search the index for:")
        x = input()
        if x in vocab:
            start_time = time.time()
            # Prefer a per-word index file if one exists; otherwise fall
            # back to the per-letter index for the word's first character.
            if os.path.exists("indexes/inverted_index_" + x + ".pbz2"):
                index = utils.load_index("indexes/inverted_index_" + x)
                loaded = x
            else:
                index = utils.load_index(filename="indexes/inverted_index_" + x[0])
                loaded = x[0]
            end_time = time.time()
            print(("Took {} seconds to load index " + loaded).format(end_time - start_time))
            # Print the number of docs the term appears in, then the top 10
            # docs: doc_id, how often the term occurs in that doc, and the
            # doc's citation count.
            print(index[x]["doc_frequency"])
            for k in list(index[x]["doc_ids"].keys())[:10]:
                print(k, index[x]["doc_ids"][k], citations[k])
        else:
            print("'{}' is not in the vocabulary.".format(x))
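
# --- Helper sketch (assumption: utils.load_index is not shown here) ---
# The ".pbz2" extension checked above suggests the indexes are stored as
# bz2-compressed pickles; a plausible loader, appending the extension the
# way the calls above imply, could be:
import bz2
import pickle

def load_index(filename):
    """Load a bz2-compressed pickled inverted index from disk."""
    with bz2.BZ2File(filename + ".pbz2", "rb") as f:
        return pickle.load(f)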