def create_author_profile(db_config_filename="db_config.json"):
    """
    Fetch and store the Semantic Scholar profile of every known author.

    For every document in the author_id -> author_name collection, the
    Semantic Scholar author end point (``SS_AUTHOR_API + <author id>``) is
    queried and the JSON payload of a successful response is inserted
    unchanged into the author profile MONGODB collection. Authors whose
    profile is already stored are skipped, so the function can be re-run
    to resume an interrupted crawl without duplicating profiles.

    :param db_config_filename: DB configuration file
    :type db_config_filename: str
    """
    db = DB(config_filename=db_config_filename)
    author_id_name_collection = db.get_author_id_name_collection()
    author_profile_collection = db.get_author_profile_collection()

    # The loop sleeps between API calls, so the cursor is opened without a
    # server-side timeout. Such a cursor has to be closed explicitly.
    author_id_name_docs = author_id_name_collection.find(
        no_cursor_timeout=True)
    try:
        for author_id_name in tqdm(author_id_name_docs,
                                   total=author_id_name_collection.count()):
            author_id = author_id_name['s2_id']

            # Skip if the author's profile is already present. Stored
            # profiles are read back through the 'authorId' key (see
            # create_preprocessed_titles), so the lookup must use that
            # exact spelling.
            if author_profile_collection.find_one({'authorId': author_id}):
                continue

            response = requests.get(SS_AUTHOR_API + author_id)
            if response.status_code == requests.codes.ok:
                author_profile_collection.insert_one(response.json())

            # sleep before making further api calls
            time.sleep(1)
    finally:
        author_id_name_docs.close()
def test_author_profile_collection_is_filled(self):
    """
    The author profile collection must contain at least one document
    """
    profiles = DB().get_author_profile_collection()
    n_profiles = profiles.count()
    assert n_profiles > 0
def test_author_id_collection_is_filled(self):
    """
    After running create_authors_id_mapping, the
    author_id -> author_name MONGODB collection must not be empty
    """
    id_name_collection = DB().get_author_id_name_collection()
    create_authors_id_mapping()
    n_mappings = id_name_collection.count()
    assert n_mappings > 0
def test_author_id_is_not_none(self):
    """
    No document of the author_id -> author_name collection may carry
    ``None`` as its ``s2_id``
    """
    id_name_collection = DB().get_author_id_name_collection()

    author_ids = []
    for mapping_doc in id_name_collection.find():
        author_ids.append(mapping_doc['s2_id'])

    assert None not in author_ids
def test_preprocessed_titles_is_filled(self):
    """
    After running create_preprocessed_titles, the titles_preprocessed
    collection must not be empty
    """
    titles_collection = DB().get_titles_preprocessed_collection()
    create_preprocessed_titles()
    n_titles = titles_collection.count()
    assert n_titles > 0
def populate_db(corpus_filepath):
    """
    Load a corpus file into MONGODB and build all the derived collections.

    The corpus is read line by line. Every non-blank line is parsed as one
    JSON document and inserted into the papers collection. Afterwards
    ``create_authors_id_mapping``, ``create_author_profile`` and
    ``create_preprocessed_titles`` are run, in that order, with their
    default configuration.

    :param corpus_filepath: Path of the corpus file (one JSON document
        per line)
    :type corpus_filepath: str
    :raises FileNotFoundError: If ``corpus_filepath`` is not an existing file
    """
    if not os.path.isfile(corpus_filepath):
        raise FileNotFoundError(
            "We could not find the path of the corpus file that you specified")

    db = DB()
    papers_collection = db.get_papers_collection()

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(corpus_filepath, 'r', encoding='utf-8') as fp:
        for line in fp:
            # A blank line (typically a trailing one) is not a document and
            # would make json.loads raise.
            if not line.strip():
                continue
            papers_collection.insert_one(json.loads(line))

    create_authors_id_mapping()
    create_author_profile()
    create_preprocessed_titles()
def test_no_config_file_found(self):
    """
    Constructing DB with a config filename that does not exist
    should raise a FileNotFoundError
    """
    missing_config = "dummy.json"
    with pytest.raises(FileNotFoundError):
        DB(config_filename=missing_config)
def get_authors():
    """
    /get_authors

    Returns every author stored in the author_id -> author_name
    collection as a JSON response of the form
    ``{"data": [{"name": ..., "id": ...}, ...], "status": 200}``
    """
    mapping_collection = DB().get_author_id_name_collection()

    payload = []
    for record in mapping_collection.find():
        payload.append({
            'name': record['s2_name'],
            'id': record['s2_id']
        })

    response = jsonify({"data": payload, "status": 200})
    response.status_code = 200
    return response
def create_preprocessed_titles(db_config_filename="db_config.json"):
    """
    Normalise the paper titles of every stored author profile and write
    them to the titles_preprocessed collection.

    A title is stripped, has its newlines removed and its tabs turned into
    spaces, is lower-cased and finally loses its stop words. Papers without
    a year are ignored, and authors that already have at least one
    pre-processed title are skipped entirely.

    :param db_config_filename: DB configuration file
    :type db_config_filename: str
    """
    nlp = spacy.load('en')
    db = DB(config_filename=db_config_filename)
    author_profile_collection = db.get_author_profile_collection()
    titles_preprocessed_collection = db.get_titles_preprocessed_collection()

    progress = tqdm(author_profile_collection.find(),
                    total=author_profile_collection.count(),
                    desc="Pre processing author paper titles")

    for profile in progress:
        author_id = profile['authorId']
        author_papers = profile['papers']

        # This author has been handled by an earlier run
        if titles_preprocessed_collection.find_one({'authorId': author_id}):
            continue

        for paper in author_papers:
            raw_title = paper['title']
            year = paper['year']
            if not year:
                continue

            # Pre-processing
            cleaned = (raw_title.strip()
                       .replace("\n", "")
                       .replace("\t", " ")
                       .lower())

            # remove stop words
            kept_words = [token.text for token in nlp(cleaned)
                          if not token.is_stop]

            titles_preprocessed_collection.insert_one({
                "title": ' '.join(kept_words),
                "year": year,
                "authorId": author_id
            })
def test_no_port_number_in_config_file(self):
    """
    If the config file has no port number (the file written here only
    contains the ``db`` key) then DB should throw a ValueError
    """
    dummy_config_path = os.path.join(CONFIG_DIR, "dummy_config.json")
    with open(dummy_config_path, 'w') as fp:
        json.dump({'db': 'scholar_cloud_test'}, fp)

    # Remove the temporary config file even when the expectation below
    # fails, so a failing run does not leave it behind in CONFIG_DIR.
    try:
        with pytest.raises(ValueError):
            DB(config_filename="dummy_config.json")
    finally:
        os.remove(dummy_config_path)
def test_no_db_found_in_config_file(self):
    """
    If there is no db key in the config file then DB should throw
    a ValueError
    """
    dummy_config_path = os.path.join(CONFIG_DIR, "dummy_config.json")
    with open(dummy_config_path, 'w') as fp:
        json.dump({'blah': 1}, fp)

    # Remove the temporary config file even when the expectation below
    # fails, so a failing run does not leave it behind in CONFIG_DIR.
    try:
        with pytest.raises(ValueError):
            DB(config_filename="dummy_config.json")
    finally:
        os.remove(dummy_config_path)
def create_authors_id_mapping(db_config_filename="db_config.json"):
    """
    Reads the papers from MONGODB and creates another collection that
    contains the mapping from author_id -> author_name

    * *This method ideally has to be called only once.*
    * *If you call it multiple times, only authors whose id is not yet in
      the MONGODB collection are added.*

    For every author of every paper the first entry of ``ids`` is taken as
    the author id; authors without any id are ignored. When the same id
    occurs with several names, the name seen last wins.

    :param db_config_filename: DB configuration filename
    :type db_config_filename: str
    """
    db = DB(config_filename=db_config_filename)
    papers_collection = db.get_papers_collection()
    author_id_name_collection = db.get_author_id_name_collection()

    # First pass: collect every author id found in the papers. Later
    # occurrences deliberately overwrite earlier names.
    author_id_mapping = {}
    for paper in papers_collection.find():
        for author in paper['authors']:
            author_name = author['name']
            ids = author['ids']
            # We will always take the first id found
            if ids:
                author_id_mapping[ids[0]] = author_name

    # Second pass: nothing is inserted before this point, so whether an id
    # is already stored is constant. Ask the database once per distinct id
    # rather than once per author occurrence.
    author_id_collection = [
        {'s2_id': author_id, 's2_name': author_name}
        for author_id, author_name in author_id_mapping.items()
        if not author_id_name_collection.find_one({'s2_id': author_id})
    ]

    if author_id_collection:
        author_id_name_collection.insert_many(author_id_collection)
def test_get_papers_collection_throws_error(self):
    """
    get_papers_collection should raise a ValueError when the config
    has no ``s2_papers_collection`` key
    """
    db = DB()
    trimmed_config = db.read_db_config()
    trimmed_config.pop('s2_papers_collection')
    db.config = trimmed_config

    with pytest.raises(ValueError):
        db.get_papers_collection()
def test_get_titles_preprocessed_collection_throws_error(self):
    """
    get_titles_preprocessed_collection should raise a ValueError when the
    config has no ``titles_preprocessed_collection`` key
    """
    db = DB()
    trimmed_config = db.read_db_config()
    trimmed_config.pop('titles_preprocessed_collection')
    db.config = trimmed_config

    with pytest.raises(ValueError):
        db.get_titles_preprocessed_collection()
def test_get_author_profile_throws_error(self):
    """
    get_author_profile_collection should raise a ValueError when the
    config has no ``author_profile_collection`` key
    """
    db = DB()
    trimmed_config = db.read_db_config()
    trimmed_config.pop('author_profile_collection')
    db.config = trimmed_config

    with pytest.raises(ValueError):
        db.get_author_profile_collection()
from flask import Flask from flask import request from flask import jsonify from flask_cors import CORS from scholar_cloud_backend.db.db import DB from scholar_cloud_backend.utils.data_utils import get_tfidf import operator app_name = 'scholar_cloud' app = Flask(app_name) CORS(app) db = DB() @app.route('/top_words/') def top_words(): """ /top_words/ Generates the top words for a given author and between given years. Request should contain the following parameters * author_id * from_year * to_year * limit - Number of top words to return """ author_id = request.args.get('author_id', None) from_year = request.args.get('from_year', None)
def test_get_titles_preprocessed_collection_returns_collection(self):
    """
    get_titles_preprocessed_collection should hand back a pymongo
    Collection
    """
    returned = DB().get_titles_preprocessed_collection()
    assert type(returned) is pymongo.collection.Collection
def test_read_db_config_returns_dict(self):
    """
    read_db_config should hand back a plain dict
    """
    returned = DB().read_db_config()
    assert type(returned) is dict
def test_get_papers_collection_returns_collection(self):
    """
    get_papers_collection should hand back a pymongo Collection
    """
    returned = DB().get_papers_collection()
    assert type(returned) is pymongo.collection.Collection
def test_author_profile_collection_returns_collection(self):
    """
    get_author_profile_collection should hand back a pymongo Collection
    """
    returned = DB().get_author_profile_collection()
    assert type(returned) is pymongo.collection.Collection