Ejemplo n.º 1
0
def create_author_profile(db_config_filename="db_config.json"):
    """
    Fetch and store the Semantic Scholar profile of every known author.

    Walks the author_id -> author_name MONGODB collection and, for every
    author whose profile is not stored yet, calls the Semantic Scholar API
    (``SS_AUTHOR_API`` followed by the author id). A successful JSON response
    is inserted unchanged into the author profile MONGODB collection.
    Responses with a non-OK status code are skipped silently.

    :param db_config_filename:  DB configuration file
    :type db_config_filename: str
    """
    db = DB(config_filename=db_config_filename)
    author_id_name_collection = db.get_author_id_name_collection()
    author_profile_collection = db.get_author_profile_collection()

    author_id_name_docs = author_id_name_collection.find(no_cursor_timeout=True)
    try:
        for author_id_name in tqdm(author_id_name_docs,
                                   total=author_id_name_collection.count()):
            author_id = author_id_name['s2_id']

            # Skip if the author's profile is already present. Stored profiles
            # are keyed by 'authorId' (the key the title pre-processing step
            # reads); querying 'authorID' never matched, so every run
            # re-fetched and re-inserted all profiles.
            if author_profile_collection.find_one({'authorId': author_id}):
                continue

            response = requests.get(SS_AUTHOR_API + author_id)
            if response.status_code == requests.codes.ok:
                author_profile_collection.insert_one(response.json())

            # sleep before making further api calls
            time.sleep(1)
    finally:
        # A cursor opened with no_cursor_timeout is not reaped by the server,
        # so it has to be closed explicitly, even if the loop fails.
        author_id_name_docs.close()
    def test_author_profile_collection_is_filled(self):
        """
        The author profile collection must hold at least one document.
        """
        profiles = DB().get_author_profile_collection()
        n_profiles = profiles.count()

        assert n_profiles > 0
    def test_author_id_collection_is_filled(self):
        """
        After the author id mapping is built, the author_id -> author_name
        MONGODB collection must not be empty.
        """
        id_name_collection = DB().get_author_id_name_collection()

        create_authors_id_mapping()
        n_authors = id_name_collection.count()

        assert n_authors > 0
 def test_author_id_is_not_none(self):
     """
     No document of the author_id -> author_name collection may have
     ``None`` as its ``s2_id``.
     """
     id_name_collection = DB().get_author_id_name_collection()

     s2_ids = []
     for id_name_doc in id_name_collection.find():
         s2_ids.append(id_name_doc['s2_id'])

     assert None not in s2_ids
    def test_preprocessed_titles_is_filled(self):
        """
        After the title pre-processing runs, the titles_preprocessed
        collection must contain at least one document.
        """
        titles_collection = DB().get_titles_preprocessed_collection()

        create_preprocessed_titles()
        n_titles = titles_collection.count()

        assert n_titles > 0
Ejemplo n.º 6
0
def populate_db(corpus_filepath):
    """
    Load a corpus file into the papers collection and build the derived data.

    Every non-blank line of the corpus file is parsed as one JSON document and
    inserted into the papers collection. Afterwards the author id mapping,
    the author profiles and the pre-processed titles are created, in that
    order.

    :param corpus_filepath: Path of the corpus file, one JSON document per line
    :type corpus_filepath: str
    :raises FileNotFoundError: if ``corpus_filepath`` is not an existing file
    """
    if not os.path.isfile(corpus_filepath):
        raise FileNotFoundError(
            "We could not find the path of the corpus file that you specified")

    db = DB()
    papers_collection = db.get_papers_collection()
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(corpus_filepath, 'r', encoding='utf-8') as fp:
        for line in fp:
            # A blank line is not a JSON document and would make json.loads
            # fail, aborting the whole import.
            if not line.strip():
                continue
            papers_collection.insert_one(json.loads(line))

    create_authors_id_mapping()
    create_author_profile()
    create_preprocessed_titles()
Ejemplo n.º 7
0
 def test_no_config_file_found(self):
     """
     Constructing DB with a config file name that does not exist must raise
     a FileNotFoundError.
     """
     missing_config = "dummy.json"

     with pytest.raises(FileNotFoundError):
         DB(config_filename=missing_config)
Ejemplo n.º 8
0
def get_authors():
    """
    /get_authors Lists every author stored in the author id/name collection.

    The JSON response has a ``data`` list with one ``{"name", "id"}`` entry
    per author and a ``status`` of 200.
    """
    id_name_collection = DB().get_author_id_name_collection()

    data = []
    for author_doc in id_name_collection.find():
        data.append({
            'name': author_doc['s2_name'],
            'id': author_doc['s2_id'],
        })

    payload = {"data": data, "status": 200}
    response = jsonify(payload)
    response.status_code = 200
    return response
Ejemplo n.º 9
0
def create_preprocessed_titles(db_config_filename="db_config.json"):
    """
    Pre-process the paper titles of every stored author profile.

    For each author profile that has no pre-processed titles yet, every paper
    that has both a year and a title is normalised (stripped, newlines
    removed, tabs turned into spaces, lower-cased) and its spaCy stop words
    are dropped. The resulting title is stored in the titles_preprocessed
    collection together with the paper's year and the author's id.

    :param db_config_filename: DB configuration file
    :type db_config_filename: str
    """
    nlp = spacy.load('en')
    db = DB(config_filename=db_config_filename)
    author_profile_collection = db.get_author_profile_collection()
    titles_preprocessed_collection = db.get_titles_preprocessed_collection()
    author_profiles = author_profile_collection.find()

    for author_profile in tqdm(author_profiles, total=author_profile_collection.count(),
                               desc="Pre processing author paper titles"):
        s2_id = author_profile['authorId']
        papers = author_profile['papers']
        if titles_preprocessed_collection.find_one({'authorId': s2_id}):
            continue

        preprocessed_docs = []
        for paper in papers:
            title = paper['title']
            year = paper['year']

            # Papers without a year are skipped; a missing (None) title cannot
            # be normalised and is skipped as well instead of crashing.
            if not year or title is None:
                continue

            # Pre-processing
            title = title.strip().replace("\n", "").replace("\t", " ").lower()

            # remove stop words
            kept_words = [token.text for token in nlp(title) if not token.is_stop]

            preprocessed_docs.append({
                "title": ' '.join(kept_words),
                "year": year,
                "authorId": s2_id
            })

        # Write all of an author's titles in one call. The skip check above is
        # per author, so inserting paper by paper would leave an author
        # permanently half-processed if the run were interrupted midway.
        # insert_many rejects an empty list, hence the guard.
        if preprocessed_docs:
            titles_preprocessed_collection.insert_many(preprocessed_docs)
Ejemplo n.º 10
0
    def test_no_port_number_in_config_file(self):
        """
        A config file that only contains the ``db`` key (and therefore no
        port number) must make DB raise a ValueError.
        """
        config_path = os.path.join(CONFIG_DIR, "dummy_config.json")
        with open(config_path, 'w') as fp:
            json.dump({'db': 'scholar_cloud_test'}, fp)

        try:
            with pytest.raises(ValueError):
                DB(config_filename="dummy_config.json")
        finally:
            # Remove the temporary config even when the assertion fails, so a
            # failing run does not leave the file behind for other tests.
            os.remove(config_path)
Ejemplo n.º 11
0
    def test_no_db_found_in_config_file(self):
        """
        If there is no ``db`` key in the config file, DB must raise a
        ValueError.
        """
        config_path = os.path.join(CONFIG_DIR, "dummy_config.json")
        with open(config_path, 'w') as fp:
            json.dump({'blah': 1}, fp)

        try:
            with pytest.raises(ValueError):
                DB(config_filename="dummy_config.json")
        finally:
            # Remove the temporary config even when the assertion fails, so a
            # failing run does not leave the file behind for other tests.
            os.remove(config_path)
Ejemplo n.º 12
0
def create_authors_id_mapping(db_config_filename="db_config.json"):
    """
       Reads the papers from MONGODB and
       fills another collection that contains the mapping from
       author_id -> author_name

       * *Only the first id listed for an author is used; authors without any
         id are skipped.*
       * *This method ideally has to be called only once.*
       * *If you call it multiple times, only author ids that are not stored
         yet are inserted; documents already in the collection are left
         untouched.*

       :param db_config_filename: DB configuration filename
       :type db_config_filename: str
    """

    db = DB(config_filename=db_config_filename)
    papers_collection = db.get_papers_collection()
    author_id_name_collection = db.get_author_id_name_collection()
    papers = papers_collection.find()

    author_id_mapping = {}
    # Ids found to be stored already. The collection is not modified until
    # the final insert, so one lookup per distinct id is enough.
    already_stored_ids = set()
    for paper in papers:
        for author in paper['authors']:
            author_name = author['name']
            ids = author['ids']  # We will always take the first id found
            if not ids:
                continue
            author_id = ids[0]

            if author_id in already_stored_ids:
                continue

            # Only hit the database the first time an id is seen.
            if (author_id not in author_id_mapping
                    and author_id_name_collection.find_one({'s2_id': author_id})):
                already_stored_ids.add(author_id)
                continue

            # If the same new id shows up again, the latest name wins.
            author_id_mapping[author_id] = author_name

    author_id_collection = [{'s2_id': author_id, 's2_name': author_name}
                            for author_id, author_name in author_id_mapping.items()]

    # insert_many rejects an empty list of documents
    if author_id_collection:
        author_id_name_collection.insert_many(author_id_collection)
Ejemplo n.º 13
0
    def test_get_papers_collection_throws_error(self):
        """
        get_papers_collection must raise a ValueError when the config has no
        ``s2_papers_collection`` key.
        """
        database = DB()
        broken_config = database.read_db_config()
        del broken_config['s2_papers_collection']
        database.config = broken_config

        with pytest.raises(ValueError):
            database.get_papers_collection()
Ejemplo n.º 14
0
    def test_get_titles_preprocessed_collection_throws_error(self):
        """
        get_titles_preprocessed_collection must raise a ValueError when the
        config has no ``titles_preprocessed_collection`` key.
        """
        database = DB()
        broken_config = database.read_db_config()
        del broken_config['titles_preprocessed_collection']
        database.config = broken_config

        with pytest.raises(ValueError):
            database.get_titles_preprocessed_collection()
Ejemplo n.º 15
0
    def test_get_author_profile_throws_error(self):
        """
        get_author_profile_collection must raise a ValueError when the config
        has no ``author_profile_collection`` key.
        """
        database = DB()
        broken_config = database.read_db_config()
        del broken_config['author_profile_collection']
        database.config = broken_config

        with pytest.raises(ValueError):
            database.get_author_profile_collection()
Ejemplo n.º 16
0
from flask import Flask
from flask import request
from flask import jsonify
from flask_cors import CORS
from scholar_cloud_backend.db.db import DB
from scholar_cloud_backend.utils.data_utils import get_tfidf
import operator

# Name passed to Flask as the application's import name
app_name = 'scholar_cloud'
app = Flask(app_name)
# Enable CORS on the app; no extra options are passed
CORS(app)
# Module-level DB handle, created once when this module is imported
db = DB()


@app.route('/top_words/')
def top_words():
    """
    /top_words/ Generates the top words for a given author
    between the given years.

    The request should contain the following query parameters:

    * author_id
    * from_year
    * to_year
    * limit - Number of top words to return

    """
    # Each query parameter falls back to None when it is absent.
    # NOTE(review): only author_id and from_year are read in the visible part
    # of this function; the handling of to_year and limit and the response
    # are not visible here - confirm against the full file.
    author_id = request.args.get('author_id', None)
    from_year = request.args.get('from_year', None)
Ejemplo n.º 17
0
 def test_get_titles_preprocessed_collection_returns_collection(self):
     """
     get_titles_preprocessed_collection must return a pymongo Collection.
     """
     db = DB()
     # isinstance is the idiomatic type check and also accepts subclasses
     assert isinstance(db.get_titles_preprocessed_collection(),
                       pymongo.collection.Collection)
Ejemplo n.º 18
0
    def test_read_db_config_returns_dict(self):
        """
        read_db_config must return the configuration as a dict.
        """
        db = DB()
        # isinstance is the idiomatic type check and also accepts subclasses
        assert isinstance(db.read_db_config(), dict)
Ejemplo n.º 19
0
 def test_get_papers_collection_returns_collection(self):
     """
     get_papers_collection must return a pymongo Collection.
     """
     db = DB()
     # isinstance is the idiomatic type check and also accepts subclasses
     assert isinstance(db.get_papers_collection(),
                       pymongo.collection.Collection)
Ejemplo n.º 20
0
 def test_author_profile_collection_returns_collection(self):
     """
     get_author_profile_collection must return a pymongo Collection.
     """
     db = DB()
     # isinstance is the idiomatic type check and also accepts subclasses
     assert isinstance(db.get_author_profile_collection(),
                       pymongo.collection.Collection)