Example #1
def arxiv_cleanup(working_folder,
                  earliest_date=None,
                  latest_date=None,
                  remove_columns=None):
    """
    Cleans the crawl results from arxiv.

    :param working_folder: Folder containing the files
    :type working_folder: str
    :param remove_columns: Columns to be removed from the crawled dataframe. If none are given, the default is to remove
                           [u'abstract', u'affiliations', u'link_abstract', u'link_doi', u'link_pdf', u'comment']
    :type remove_columns: list of str
    :param earliest_date: Articles before this date are removed
    :type earliest_date: datetime
    :param latest_date: Articles after this date are removed
    :type latest_date: datetime

    :return: None
    """

    config = logging_confdict(working_folder, __name__ + "_cleanup")
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__ + "_cleanup")

    # Read in stage_1 raw file
    try:
        stage_1_raw = pd.read_json(working_folder + "/stage_1_raw.json")
    except Exception as e:
        arxiv_logger.exception("Could not load stage_1_raw file. Exiting...")
        sys.exit("Could not load stage_1_raw file")
Example #2
def crossref_cleanup(working_folder,
                     earliest_date=None,
                     latest_date=None,
                     remove_columns=None):
    """
    Cleans the crawl results from crossref.

    :param working_folder: Folder containing the files
    :type working_folder: str
    :param remove_columns: Columns to be removed from the crawled dataframe. If None, no columns are removed
    :type remove_columns: list of str
    :param earliest_date: Articles before this date are removed
    :type earliest_date: datetime
    :param latest_date: Articles after this date are removed
    :type latest_date: datetime

    :return: None
    """

    config = logging_confdict(working_folder, __name__ + "_cleanup")
    logging.config.dictConfig(config)
    cr_logger = logging.getLogger(__name__ + "_cleanup")

    # Read in stage_2 raw file
    try:
        stage_2_raw = pd.read_json(working_folder + "/stage_2_raw.json")
    except Exception as e:
        cr_logger.exception("Could not load stage_2_raw file. Exiting...")
        sys.exit("Could not load stage_2_raw file")
Example #3
def arxiv_crawl(crawling_list,
                limit=None,
                batchsize=100,
                submission_range=None,
                update_range=None,
                delay=None):
    """
    This is a Python wrapper for the aRxiv "arxiv_search" function.

    If submission_range or update_range are given, the results are filtered according to the date ranges.

    :param crawling_list: The subcategories to crawl. NOT "stat" -> USE "stat.AP" etc...
    :type crawling_list: dict of lists.
    :param limit: Max number of results to return.
    :type limit: int.
    :param batchsize: Number of queries per request.
    :type batchsize: int.
    :param submission_range: The range of submission dates.
    :type submission_range: Tuple (start,end).
    :param update_range: The range of last-update dates.
    :type update_range: Tuple (start,end).

    :returns:  The created folder
    """

    # Timestamp of starting datetime
    ts_start = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts_start).strftime(
        '%Y-%m-%d_%H-%M-%S')

    # Create folder structure
    working_folder = base_directory + timestamp
    os.makedirs(working_folder)
    os.makedirs(working_folder + "/temp_files")

    # Setup logging
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__)

    arxiv_logger.info("Starting new crawl for {}".format(str(crawling_list)))
    arxiv_logger.info("Created new folder: <<" + working_folder + ">>")

    # Load R-scripts
    arxiv_logger.debug("Loading R-Scripts ...")
    try:
        with open('../r_scripts/arxiv.R', 'r') as f:
            string = ''.join(f.readlines())
        arxiv_crawler = SignatureTranslatedAnonymousPackage(
            string, "arxiv_crawler")
    except Exception as e:
        arxiv_logger.exception("Error while loading R-Scripts.")
        sys.exit('Could not load R-Scripts!')
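
The limit/batchsize parameters imply paging through the arXiv API in chunks. A small sketch of that batching logic follows (the helper name is made up; the actual R call inside arxiv.R is not reproduced here):

def batch_offsets(limit, batchsize=100):
    """Yield (start, count) pairs covering `limit` results in chunks of `batchsize`."""
    start = 0
    while start < limit:
        yield start, min(batchsize, limit - start)
        start += batchsize

# e.g. list(batch_offsets(250, 100)) == [(0, 100), (100, 100), (200, 50)]
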
Example #4
def test_merge(timestamp):
    """
    Call manually if automatic merging of json files fails.

    :param timestamp: The timestamp of the crawl process that failed to merge the temporary json files
    :return: <str> - Working folder
    """

    working_folder = base_directory + timestamp
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__)

    from path import Path

    temp_files = list(Path(working_folder + "/temp_files/").files("*.json"))

    try:
        temp_jsons = []

        for idx, temp_file in enumerate(temp_files):
            arxiv_logger.debug(temp_file)
            with open(temp_file) as data_file:
                temp = json.load(data_file)
            temp_jsons.append(temp)

        temp_json = temp_jsons[0]
        for d in temp_jsons[1:]:
            for key, val_dict in d.items():
                new_dict = {}
                offset = len(temp_json[key].values())
                for doc_id in val_dict.keys():
                    new_doc_id = offset + int(doc_id)
                    new_dict[new_doc_id] = val_dict.pop(doc_id)
                temp_json[key].update(new_dict)
            print("Length of concatenated dataset: ",
                  len(temp_json['id'].keys()))

        result_df = pd.DataFrame.from_dict(temp_json)

        result_df.index = range(0, len(result_df.index))
        result_df = result_df.fillna(np.nan)

        result_df.to_json(working_folder + "/stage_1_raw.json")
    except Exception:
        arxiv_logger.exception(
            "Error during concatenation of temporary objects")

    return working_folder
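
A toy illustration of the re-keying done in the merge loop above: the record keys of each following fragment are shifted by the current length of the merged column, so the per-column dicts can simply be updated (values are made up; in the real code the keys arrive as JSON strings and are cast with int()):

a = {'id': {0: 'arxiv-0001', 1: 'arxiv-0002'}}   # already merged fragment
b = {'id': {0: 'arxiv-0003', 1: 'arxiv-0004'}}   # next temp file

offset = len(a['id'])                             # 2
shifted = {key + offset: val for key, val in b['id'].items()}
a['id'].update(shifted)
# a['id'] is now {0: 'arxiv-0001', 1: 'arxiv-0002', 2: 'arxiv-0003', 3: 'arxiv-0004'}
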
Example #5
def crossref_crawl(num_processes=1,
                   num_threads=1,
                   input_folder=None,
                   continue_folder=None):
    """
    DOI Lookup interfaces to different DOI providers.
    Currently implemented: CrossRef.
    To-Do: DataCite

    Stage 1 dataset is split into equally sized subframes. Each is given to a subprocess that accesses the
    crossref API with multiple threads.
    Possible candidate documents are matched with original arxiv-documents using Levenshtein Ratio (Schloegl et al)
    
    :param num_processes: Number of processes the stage_1 dataset is split across
    :param num_threads: Number of threads each process uses to access the crossref API
    :param input_folder: The folder containing the stage 1 data. If not given, the most recent folder is used
    :param continue_folder: Folder of a previous, interrupted crawl to continue; indices already present in its temp files are skipped
    :returns: pd.DataFrame - newly found DOIs with original indices
    """
    ts_start = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts_start).strftime(
        '%Y-%m-%d_%H-%M-%S')

    # Create folder structure
    if not input_folder:
        all_subdirs = [d for d in Path(base_directory).listdir() if d.isdir()]
        latest_subdir = max(all_subdirs, key=Path.getmtime)
        base_folder = latest_subdir + "/"
    else:
        base_folder = input_folder
        if base_folder[-1] != "/":
            base_folder += "/"

    if continue_folder:
        working_folder = continue_folder
        temp_folder = working_folder + "/temp/"
    else:
        working_folder = base_folder + timestamp
        temp_folder = working_folder + "/temp/"
        Path(working_folder).mkdir()
        Path(temp_folder).mkdir()

    skip_indices = set()
    if continue_folder:
        # Setup logging
        config = logging_confdict(working_folder, __name__)
        logging.config.dictConfig(config)
        cr_logger = logging.getLogger(__name__)

        cr_logger.info("Continuing crawl in <<" + working_folder + ">>")

        for temp_file in Path(temp_folder).files("*.csv"):
            with open(temp_file, "rb") as tempfile:
                r = csv.reader(tempfile, delimiter=";")
                for line in r:
                    if len(line) == 6:
                        if line[-1] == "False" or line[-1] == "True":
                            skip_indices.add(int(line[0]))

    else:
        # Setup logging
        config = logging_confdict(working_folder, __name__)
        logging.config.dictConfig(config)
        cr_logger = logging.getLogger(__name__)

        cr_logger.info("\nCreated new folder: <<" + working_folder + ">>")

    # Read in stage 1 file
    cr_logger.debug("\nReading in stage_1.json ... (Might take a few seconds)")
    try:
        stage_1 = pd.read_json(base_folder + "/stage_1.json")
    except Exception:
        cr_logger.exception("Problem occurred while reading stage_1.json")
        sys.exit("Could not read stage_1 file")

    stage_1.sort_index(inplace=True)
    stage_1['submitted'] = pd.to_datetime(stage_1['submitted'], unit="ms")

    stage_1.index = range(0, len(stage_1.index))

    crawl_stage_1 = stage_1.drop(skip_indices)

    cr_logger.info(
        "\nSpawning {} processes - output will be cluttered... :S\n".format(
            num_processes))
    # Split df into n sub-dataframes for n processes
    df_ranges = range(0, len(crawl_stage_1.index),
                      len(crawl_stage_1.index) // num_processes + 1)
    df_ranges = df_ranges + [len(crawl_stage_1.index)]
    pool_args = []
    if len(df_ranges) == 1:
        indices = []
        authors = []
        titles = []
        submitted = []
        pool_args.append([indices, authors, titles, submitted])
    else:
        for idx in range(num_processes):
            cr_logger.info("Starting process {}".format(idx))
            indices = crawl_stage_1.iloc[range(
                df_ranges[idx], df_ranges[idx + 1])].index.values
            authors = crawl_stage_1.iloc[range(df_ranges[idx],
                                               df_ranges[idx + 1])].authors
            titles = crawl_stage_1.iloc[range(df_ranges[idx],
                                              df_ranges[idx + 1])].title
            submitted = crawl_stage_1.iloc[range(df_ranges[idx],
                                                 df_ranges[idx + 1])].submitted
            pool_args.append([indices, authors, titles, submitted])

    pool = mp.Pool(processes=num_processes)
    for x in pool_args:
        pool.apply_async(crossref_lookup,
                         args=(working_folder, x[0], x[1], x[2], x[3],
                               num_threads))

    pool.close()
    pool.join()

    cr_logger.info("All processes finished")

    output = []
    for temp_file in Path(temp_folder).files("*.csv"):
        with open(temp_file, "rb") as tempfile:
            r = csv.reader(tempfile, delimiter=";")
            for line in r:
                if len(line) == 6:
                    result = {
                        'idx': int(line[0]),
                        'cr_title': line[1],
                        'cr_doi': line[3],
                        'lr': line[4]
                    }
                    if line[-1] == "False":
                        result['cr_title'] = np.nan
                        result['cr_doi'] = np.nan
                    output.append(result)

    cr_data = pd.DataFrame(output)
    cr_data = cr_data.set_index("idx", drop=True)

    cr_logger.info("\nMerging stage_1 dataset and crossref results")

    stage_2_raw = pd.merge(stage_1,
                           cr_data,
                           left_index=True,
                           right_index=True,
                           how='left')
    print(stage_2_raw)
    stage_2_raw.sort_index(inplace=True)

    try:
        stage_2_raw.to_json(working_folder + "/stage_2_raw.json")
        stage_2_raw.to_csv(working_folder + "/stage_2_raw.csv",
                           encoding="utf-8",
                           sep=Config.get("csv", "sep_char"),
                           index=False)
    except Exception as e:
        cr_logger.exception("Could not write all output files")
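
The docstring mentions matching crossref candidates to the original arXiv records via a Levenshtein ratio. Here is a dependency-free sketch of such a title match using the standard library (the real comparison presumably happens inside crossref_lookup; the 0.9 threshold is an assumption):

from difflib import SequenceMatcher

def titles_match(arxiv_title, crossref_title, threshold=0.9):
    """Treat two titles as the same document if their similarity ratio clears the threshold."""
    a = arxiv_title.lower().strip()
    b = crossref_title.lower().strip()
    return SequenceMatcher(None, a, b).ratio() >= threshold
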
Example #6
def mendeley_crawl(stage1_dir=None, stage2_dir=None, num_threads=1):
    """
    Retrieve Mendeley documents based on arXiv IDs and DOIs.
    If the arXiv ID and the DOI yield different Mendeley documents, the one with more identifiers is preferred.

    :param stage1_dir: The name of the Stage 1 folder to use. If None, the most recently created one is used
    :param stage2_dir: The name of the Stage 2 folder to use. If None, the most recently created one is used
    :param num_threads: Number of threads to use
    :return: working_folder as absolute path
    """

    ts_start = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts_start).strftime('%Y-%m-%d_%H-%M-%S')

    # Start mendeley session
    session = start_mendeley_session(Config._sections['mndly_auth'])

    # Create folder structure
    if not stage1_dir:
        all_subdirs = [base_directory + d for d in os.listdir(base_directory) if os.path.isdir(base_directory + d)]
        latest_subdir = max(all_subdirs, key=os.path.getmtime)
        stage1_dir = latest_subdir + "/"
    else:
        stage1_dir += "/"

    if not stage2_dir:
        all_subdirs = [stage1_dir + d for d in os.listdir(stage1_dir) if os.path.isdir(stage1_dir + d)]
        latest_subdir = max(all_subdirs, key=os.path.getmtime)
        stage2_dir = latest_subdir + "/"
    else:
        stage2_dir = stage1_dir + stage2_dir + "/"

    working_folder = stage2_dir + timestamp
    if not os.path.exists(working_folder):
        os.makedirs(working_folder)

    # Create logger
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    logger = logging.getLogger(__name__)

    # Read in stage 2 file
    input_df = pd.read_json(stage2_dir + "stage_2.json")
    input_df.sort_index(inplace=True)

    input_q = Queue.Queue()
    output_q = Queue.Queue()

    for idx, row in input_df.iterrows():
        input_q.put((idx, row))

    mndly_threads = []
    for i in range(0, num_threads):
        thread = MendeleyThread(logger, input_q, output_q, len(input_df.index), session)
        thread.start()
        mndly_threads.append(thread)

    for thread in mndly_threads:
        thread.join()

    output_dicts = []
    while not output_q.empty():
        output_dicts.append(output_q.get_nowait())

    # ================= TEMPORARY HACK ==============
    arxiv_ids = []
    for original_arxiv in input_df['id'].values:
        found_regex = regex_new_arxiv.findall(original_arxiv)
        if found_regex:
            arxiv_id = found_regex[0]
        else:
            found_regex = regex_old_arxiv.findall(original_arxiv)
            if found_regex:
                arxiv_id = found_regex[0]
            else:
                arxiv_id = "parse_failed"
        arxiv_ids.append(arxiv_id)
    input_df['arxiv_id'] = pd.Series(arxiv_ids, index=input_df.index)
    #  ================= TEMPORARY HACK ==============

    stage_3_raw = pd.DataFrame(output_dicts)
    stage_3_raw = pd.merge(left=input_df,
                           right=stage_3_raw,
                           left_on="arxiv_id",
                           right_on="arxiv_id",
                           how="outer")

    stage_3_raw['submitted'] = pd.to_datetime(stage_3_raw['submitted'], unit="ms")
    stage_3_raw['updated'] = pd.to_datetime(stage_3_raw['updated'], unit="ms")

    del stage_3_raw['abstract']

    try:
        stage_3_raw.to_json(working_folder + "/stage_3_raw.json")
        stage_3_raw.to_csv(working_folder + "/stage_3_raw.csv", encoding="utf-8",
                           sep=Config.get("csv", "sep_char"), index=False)
    except Exception as e:
        logger.exception("Could not write all output files")
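
The "more identifiers wins" rule from the docstring, as a standalone sketch; it assumes the two lookups return Mendeley catalog documents exposing `id` and `identifiers`, and that either lookup may come back empty:

def prefer_richer_document(doc_by_arxiv, doc_by_doi):
    """Pick the lookup result carrying more identifiers; fall back to whichever exists."""
    if doc_by_arxiv is None:
        return doc_by_doi
    if doc_by_doi is None:
        return doc_by_arxiv
    if doc_by_arxiv.id == doc_by_doi.id:
        return doc_by_arxiv
    n_arxiv = len(doc_by_arxiv.identifiers or {})
    n_doi = len(doc_by_doi.identifiers or {})
    return doc_by_arxiv if n_arxiv >= n_doi else doc_by_doi
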
Example #7
def ads_crawl_category(list_of_cats, number_of_docs=100, num_threads=1):
    """

    :param list_of_cats: <list> - Categories to crawl
    :param number_of_docs: <int> - Number of docs to crawl.
    :param num_threads: <int> - Number of ADS-Crawl threads to start
    :return: <str> - Working folder
    """

    timestamp = arrow.utcnow().to('Europe/Vienna').format('YYYY-MM-DD_HH-mm-ss')

    base_folder = base_directory

    working_folder = base_folder + timestamp
    Path(working_folder).mkdir()

    # Setup logging
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    ads_logger = logging.getLogger(__name__)

    ads_logger.info("\nCreated new folder: <<" + working_folder + ">>")

    input_queue = Queue.Queue()
    output_queue = Queue.Queue()

    for count, cat in enumerate(list_of_cats):
        payload = {'q': 'arxiv_class:"{}"'.format(cat), 'sort': 'read_count desc',
                   'fl': 'reader,title,abstract,'
                         'year,author,pub,read_count,'
                         'citation_count,identifier,arxiv_class,'
                         'primary_arxiv_class,arxiv_primary_class,'
                         'primary_class',
                   'rows': number_of_docs}

        input_queue.put((count, payload))

    threads = []
    for i in range(num_threads):
        thread = ADSThread(input_queue, output_queue, ads_logger)
        thread.start()
        threads.append(thread)

    ads_logger.debug("THREADING STARTED - PLEASE BE PATIENT")

    for thread in threads:
        thread.join()

    rows = []
    while not output_queue.empty():
        temp = output_queue.get_nowait()
        for doc in temp:
            # doc['url'] = "http://arxiv.org/abs/" + cat
            try:
                doc['authors'] = ";".join(doc['author'])
                del doc['author']
            except KeyError:
                doc['authors'] = []

            if 'reader' not in doc:
                doc['reader'] = []

            doc['readers'] = int(doc['read_count'])
            doc['reader_ids'] = u";".join(doc['reader'])
            doc['title'] = doc['title'][0]

            del doc['read_count']
            del doc['reader']
            rows.append(doc)

    # Convert to pandas dataframe
    df = pd.DataFrame(rows)

    # Rename columns
    df.rename(columns={'pub': 'published_in', 'abstract': 'paper_abstract'}, inplace=True)
    df.index.name = "id"

    # Output
    # ads_logger.debug("SAVING FILE")
    # df.to_csv(working_folder + "/ads_data.csv", sep=";", encoding='utf8', index=False)
    # df.to_json(working_folder + "/ads_data.json")

    return working_folder
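
A hypothetical invocation; the categories and sizes are illustrative, and base_directory plus the ADSThread class are assumed to be configured at module level as above:

folder = ads_crawl_category(['stat.ML', 'cs.DL'], number_of_docs=200, num_threads=4)
print("ADS category crawl written to " + folder)
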
Example #8
def ads_crawl_dataset(input_folder=None, number_of_docs=100, num_threads=1):
    """
    Uses an existing dataframe containing arXiv IDs to crawl the corresponding ADS data.
    Always uses the top *number_of_docs* documents (by reader count) for the search.

    :param input_folder: Input folder
    :param number_of_docs: Number of documents to use
    :param num_threads: Number of threads
    :return: Newly created working folder
    """
    timestamp = arrow.utcnow().to('Europe/Vienna').format('YYYY-MM-DD_HH-mm-ss')

    # Create folder structure
    if not input_folder:
        all_subdirs = [d for d in Path(base_directory).listdir() if d.isdir()]
        latest_subdir = max(all_subdirs, key=Path.getmtime)
        base_folder = latest_subdir + "/"
    else:
        # base_folder = base_directory + input_folder
        base_folder = input_folder
        if base_folder[-1] != "/":
            base_folder += "/"

    working_folder = base_folder + timestamp
    Path(working_folder).mkdir()

    # Setup logging
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    ads_logger = logging.getLogger(__name__)

    ads_logger.info("\nCreated new folder: <<" + working_folder + ">>")

    # Read in stage 1 file
    ads_logger.debug("\nReading in stage_3_raw.json ... (Might take a few seconds)")
    try:
        df = pd.read_json(base_folder + "/stage_3_raw.json")
    except IOError:
        ads_logger.exception("stage_3_raw.json does not exist")
        sys.exit()

    df.sort(columns="reader_count", ascending=False, inplace=True)
    df.index = range(0, len(df.index))

    arxiv_ids = df['arxiv_id'][0:number_of_docs].tolist()

    input_queue = Queue.Queue()
    output_queue = Queue.Queue()

    for count, arxiv_id in enumerate(arxiv_ids):
        found_regex = regex_new_arxiv.findall(arxiv_id)
        if found_regex:
            arxiv_id = found_regex[0]
        else:
            found_regex = regex_old_arxiv.findall(arxiv_id)
            if found_regex:
                arxiv_id = found_regex[0]

        payload = {'q': 'arXiv:{}'.format(arxiv_id), 'sort': 'read_count desc'}

        input_queue.put((count, payload))

    threads = []
    for i in range(num_threads):
        thread = ADSThread(input_queue, output_queue, ads_logger)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    rows = []
    while not output_queue.empty():
        temp = output_queue.get_nowait()[0]
        temp['url'] = "http://arxiv.org/abs/" + "none_currently"
        try:
            temp['authors'] = ";".join(temp['author'])
            del temp['author']
        except KeyError:
            temp['authors'] = []

        if 'reader' not in temp:
            temp['reader'] = []

        temp['readers'] = int(temp['read_count'])
        temp['reader_ids'] = u";".join(temp['reader'])
        temp['title'] = temp['title'][0]

        del temp['read_count']
        del temp['reader']
        rows.append(temp)

    # Convert to pandas dataframe
    df = pd.DataFrame(rows)

    # Rename columns
    df.rename(columns={'pub': 'published_in', 'abstract': 'paper_abstract'}, inplace=True)
    df.index.name = "id"

    # Output
    df.to_csv(working_folder + "/ads_data.csv", Config.get("csv", "sep_char"),
              encoding='utf8', index=False)
    df.to_json(working_folder + "/ads_data.json")

    return working_folder
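
regex_new_arxiv and regex_old_arxiv are referenced above but defined elsewhere in the module; patterns along these lines would cover the two arXiv identifier schemes (a sketch, not the module's actual definitions):

import re

# New-style identifiers, e.g. "1501.01234" (an optional version suffix such as "v2" is not captured)
regex_new_arxiv = re.compile(r'\d{4}\.\d{4,5}')
# Old-style identifiers, e.g. "astro-ph/0601001" or "math.GT/0309136"
regex_old_arxiv = re.compile(r'[a-z\-]+(?:\.[A-Z]{2})?/\d{7}')
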