# Ejemplo n.º 1 (Example no. 1)
import logging
from pyhelpers import tools, grobid_mapping
import config

tools.setup_logging()
import pprint
import json


def main(filter: ("filter", "option") = None):
    """Print statistics about the paper-collection MongoDB database.

    Note: the parameter intentionally shadows the builtin ``filter`` —
    the plac-style annotation derives the CLI flag name from it, so it
    cannot be renamed without changing the command-line interface.

    :param filter: optional pseudo-JSON string (single quotes allowed)
        selecting which statistics to print, e.g.
        -filter="{'printConf' : 'yes', 'printJournal' : 'yes', 'printColl': 'yes', 'showPdfProg' :'yes', 'showExtractedSents':'yes'}"
    :return: None
    """
    # Parse the filter only when one was actually supplied. The previous
    # version called filter.replace(...) unconditionally first, which
    # raised AttributeError whenever the default (None) was used, and
    # then parsed the string a second time inside the guard.
    filters = {}
    if filter:
        # The CLI passes single-quoted pseudo-JSON; normalise the quotes
        # so json.loads accepts it.
        filters = json.loads(filter.replace("'", '"'))
        print("Using filters " + str(filters))
    print(filters)
    db = tools.connect_to_mongo()
    if len(filters) > 0:
        for k, v in filters.items():
            if k == "showPdfProg" and v == "yes":
                # Count successful vs. failed PDF downloads.
                result = db.downloads.count({"success": True})
                print('{:>25} {:>8d}'.format("PDFs downloaded", result))
                #
                result = db.downloads.count({"success": False})
                print('{:>25} {:>8d}'.format("broken DBLP links", result))
# Ejemplo n.º 2 (Example no. 2)
import logging
import requests
from pyhelpers import tools, grobid_mapping
tools.setup_logging(file_name="extractor.log")
import config as cfg
from lxml import etree
from six import text_type
import os

# docker pull lfoppiano/grobid:0.4.1-SNAPSHOT
# https://grobid.readthedocs.io/en/latest/Grobid-docker/
# https://github.com/kennknowles/python-jsonpath-rw


class TextExtraction:
    """Select the venues of interest (booktitles / journals) for extraction.

    Venue lists default to the ones declared in ``config.py`` when not
    passed explicitly.

    NOTE(review): this block appears truncated by the page extraction --
    the ``if journals is None:`` branch below has no body in this excerpt.
    """

    def __init__(self, booktitles, journals):
        # booktitles: iterable of conference names, or None to fall back
        #     to cfg.booktitles from config.py.
        # journals: iterable of journal names, or None (its handling is
        #     cut off in this excerpt).

        # The booktitles are located in the config.py
        # If you are interested in specific conference just add it there
        #self.booktitles = cfg.booktitles
        #self.journals = cfg.journals

        if booktitles is None:
            # GET THE VENUES WE LIKE from config.py
            self.booktitles = cfg.booktitles
            print('Conference of Interest: {}'.format(cfg.booktitles))
        else:
            self.booktitles = booktitles
            print('Conference of Interest: {}'.format(self.booktitles))

        if journals is None:
import sys
from lxml import etree
import gzip
import datetime

# modules to extract acm papers
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
# import time to set a sleeping mode to avoid HTTP error: 503/403/303
import time
import random
import json
import re

tools.setup_logging(file_name="xml_processor.log")

# Per-site request counters. Each one tracks how many times the
# corresponding publisher site has been hit so a sleep can be inserted
# after a threshold, avoiding HTTP 503/403/303 responses (see the
# time/random imports above).
num_of_access_in_acm = 0        # ACM digital library
num_of_access_in_springer = 0   # Springer
num_of_access_in_aaai = 0       # AAAI
num_of_access_in_icwsm = 0      # ICWSM site
num_of_access_in_ieee = 0       # IEEE site

# Overall request counter across all sites.
num_of_access = 0

# Running count of PDFs successfully obtained.
numOfPDFobtained = 0