def setUp(self):
    """Spin up a throwaway, auth-enabled MongoDB instance and load test data.

    Creates an admin user (needed to authorize user creation), a 'test'/'test'
    user on the 'test' database, points the test config at the sandbox
    instance, and opens a session with ``utils.get_session``.
    """
    self.box = mongobox.MongoBox(scripting=True, auth=True)
    self.box.start()
    self.boxclient = self.box.client()
    # Admin credentials are only used here, to authorize adding the per-db user.
    self.boxclient['admin'].add_user('foo', 'bar')
    self.boxclient['admin'].authenticate('foo', 'bar')
    self.boxclient['test'].add_user('test', 'test')
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    config = utils.load_config(os.path.join(base_dir, 'test/adsdata.cfg.test'))
    config['ADSDATA_MONGO_DATABASE'] = 'test'
    config['ADSDATA_MONGO_HOST'] = 'localhost'
    config['ADSDATA_MONGO_PORT'] = self.box.port
    # BUG FIX: these were the redacted placeholder '******', which does not match
    # the 'test'/'test' user created above, so get_session() could never
    # authenticate against the sandbox instance.
    config['ADSDATA_MONGO_USER'] = 'test'
    config['ADSDATA_MONGO_PASSWORD'] = 'test'
    self.config = config
    self.session = utils.get_session(config)
    load_data(self.config)
def setUp(self):
    """Spin up a throwaway, auth-enabled MongoDB instance and load test data.

    Creates an admin user (needed to authorize user creation), a 'test'/'test'
    user on the 'test' database, points the config at the sandbox instance,
    and opens a session with ``utils.get_session``.
    """
    self.box = mongobox.MongoBox(scripting=True, auth=True)
    self.box.start()
    self.boxclient = self.box.client()
    # Admin credentials are only used here, to authorize adding the per-db user.
    self.boxclient['admin'].add_user('foo', 'bar')
    self.boxclient['admin'].authenticate('foo', 'bar')
    self.boxclient['test'].add_user('test', 'test')
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    config = utils.load_config(os.path.join(base_dir, 'adsdata.cfg'))
    config['ADSDATA_MONGO_DATABASE'] = 'test'
    config['ADSDATA_MONGO_HOST'] = 'localhost'
    config['ADSDATA_MONGO_PORT'] = self.box.port
    # BUG FIX: these were the redacted placeholder '******', which does not match
    # the 'test'/'test' user created above, so get_session() could never
    # authenticate against the sandbox instance.
    config['ADSDATA_MONGO_USER'] = 'test'
    config['ADSDATA_MONGO_PASSWORD'] = 'test'
    self.config = config
    self.session = utils.get_session(config)
    load_data(self.config)
def __init__(self, bibcode, ft_source, provider, config=False):
    """Initialize extractor state for one document.

    bibcode   -- the ADS bibcode identifying the record
    ft_source -- path/URL of the fulltext source for this record
    provider  -- name of the data provider
    config    -- optional pre-loaded config dict; any falsy value (the
                 default False, None, or {}) triggers a fresh
                 utils.load_config() call
    """
    if not config:
        self.config = utils.load_config()
    else:
        self.config = config
    self.bibcode = bibcode
    self.ft_source = ft_source
    self.provider = provider
    # Per-document extraction directory: FULLTEXT_EXTRACT_PATH plus a
    # pairtree path derived from the bibcode.
    self.extract_dir = self.config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(bibcode)
    self.meta_path = os.path.join(self.extract_dir, 'meta.json')
    # Lazy-load flags: source content is fetched on demand elsewhere.
    self.source_loaded = False
    self.source_content = None
    self.dry_run = False
    # NOTE(review): presumably reads the timestamp out of meta.json — confirm
    # against get_last_extracted()'s definition (not visible in this chunk).
    self.last_extracted = self.get_last_extracted()
    log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
def __init__(self, bibcode, ft_source, provider, config=False):
    """Set up extraction state for a single record.

    A falsy ``config`` (the default) causes the configuration to be loaded
    from disk via ``utils.load_config``; a truthy dict is used as-is.
    """
    # Fall back to loading the config only when none was supplied.
    self.config = config or utils.load_config()

    self.bibcode = bibcode
    self.ft_source = ft_source
    self.provider = provider

    # Each record gets its own pairtree-addressed directory under the
    # configured extraction root, holding the extracted text plus metadata.
    extract_root = self.config['FULLTEXT_EXTRACT_PATH']
    self.extract_dir = extract_root + ptree.id2ptree(bibcode)
    self.meta_path = os.path.join(self.extract_dir, 'meta.json')

    # Source content is loaded lazily by the extraction machinery.
    self.source_loaded = False
    self.source_content = None
    self.dry_run = False

    self.last_extracted = self.get_last_extracted()
    log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
# Jython module: mixes Python syntax with Java classes (PDFBox for PDF text
# extraction, the RabbitMQ Java client for queue consumption).
from java.lang import String
from java.lang import Thread
from java.util import HashMap
from java.util.concurrent import Executors, TimeUnit
from java.util.concurrent import Callable
from java.lang import InterruptedException
from org.apache.pdfbox.pdfparser import PDFParser
from org.apache.pdfbox.pdmodel import PDDocument
from org.apache.pdfbox.util import PDFTextStripper, TextNormalize
from com.rabbitmq.client import ConnectionFactory
from com.rabbitmq.client import Connection
from com.rabbitmq.client import Channel
from com.rabbitmq.client import QueueingConsumer
from com.rabbitmq.client.AMQP import BasicProperties

config = utils.load_config()

class PdfExtractor(Callable):
    """java.util.concurrent.Callable that consumes PDF-extraction tasks
    from a RabbitMQ queue (suitable for submission to an ExecutorService)."""

    def __init__(self, channel, opts):
        # channel -- an open RabbitMQ Channel to consume from
        # opts    -- parsed options; expected to carry at least queue_name
        self.channel = channel
        self.opts = opts
        self.consumer = QueueingConsumer(channel)

    def call(self):
        log = logging.getLogger()
        # NOTE(review): `opts` below is the bare module-level name, not
        # `self.opts` — this only works if a global `opts` exists (not visible
        # in this chunk); confirm, or it raises NameError at runtime.
        # False => manual acknowledgement mode for basicConsume.
        self.channel.basicConsume(opts.queue_name, False, self.consumer)
        log.info("Awaiting pdf extraction tasks on %s...", opts.queue_name)
        # Block forever, pulling one delivery at a time off the queue.
        while True:
            delivery = self.consumer.nextDelivery()
            props = delivery.getProperties()
op.set_usage("usage: build_docs.py [options] [%s]" % '|'.join(commands.map.keys())) op.add_option('-i', '--infile', dest="infile", action="store") op.add_option('-s', '--source_model', dest="source_model", action="store", default="Accno") op.add_option('-t','--threads', dest="threads", action="store", type=int, default=cpu_count()) # * 2) op.add_option('-l','--limit', dest="limit", action="store", type=int) op.add_option('-r','--remove', dest="remove", action="store_true", default=False) op.add_option('-d','--debug', dest="debug", action="store_true", default=False) op.add_option('-v','--verbose', dest="verbose", action="store_true", default=False) op.add_option('--profile', dest='profile', action='store_true', help='capture program execution profile', default=False) op.add_option('--pygraph', dest='pygraph', action='store_true', help='capture exec profile in a call graph image', default=False) opts, args = op.parse_args() base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) config = utils.load_config(os.path.join(base_dir, 'adsdata.cfg')) log = utils.init_logging(base_dir, opts.verbose, opts.debug) if opts.debug: log.setLevel(logging.DEBUG) try: cmd = args.pop() assert cmd in commands.map except (IndexError,AssertionError): op.error("missing or invalid command") start_cpu = time.clock() start_real = time.time() if opts.profile:
import re import os import sys import time import json import ptree import logging import itertools from itertools import imap, islice, ifilter from optparse import OptionParser from pymongo import MongoClient from multiprocessing import Process, JoinableQueue, Manager from adsdata import utils config = utils.load_config() commands = utils.commandList() log = logging.getLogger() class Worker(Process): def __init__(self, queue, opts, stats): Process.__init__(self) self.opts = opts self.queue = queue self.stats = stats def run(self): while True: doc = self.queue.get() if doc is None:
import os
import sys

# Make the package importable when the tests are run from this directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# unittest2 backports the 2.7 unittest API to older interpreters.
if sys.version_info < (2, 7):
    import unittest2 as unittest
else:
    import unittest

from adsdata import extractors, utils

base_dir = utils.get_script_path(file_name_space=__file__)
config_file = os.path.join(base_dir, 'adsdata.cfg.test')
config = utils.load_config(config_file)


class FulltextTestCase(unittest.TestCase):
    """Tests for the fulltext extractor selection machinery."""

    def test_extractor_factory(self):
        """Each (source, provider) pair maps to the expected extractor class."""
        cases = (
            ('2000xxx..999..1234L', 'http://foo/bar/baz', 'Foo', extractors.HttpExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.pdf', 'Foo', extractors.PdfExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.xml', 'Foo', extractors.XMLExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.xml', 'Elsevier', extractors.ElsevierExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.ocr', 'Foo', extractors.PlainTextExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.txt', 'Foo', extractors.PlainTextExtractor),
            ('2000xxx..999..1234L', '/foo/bar.html,/foo/baz.html', 'Foo', extractors.HtmlExtractor),
        )
        for bibcode, source, provider, expected_cls in cases:
            extractor = extractors.Extractor.factory(bibcode, source, provider, config)
            self.assertTrue(isinstance(extractor, expected_cls))
of records found """ records = [] with open(file) as f: for b in biblist: sys.stderr.write("searching for %s in file %s\n" % (b, file)) res = find_key_in_sorted_file(b, f, fold=True) for r in res: records.append("\t".join([ str(k) for k in r ])) return records if __name__ == "__main__": demo_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'demo_data') base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) config = utils.load_config(os.path.join(base_dir, 'adsdata.cfg')) for f in os.listdir(demo_dir): abs_path = os.path.join(demo_dir, f) cname = os.path.splitext(f)[0] cfile = config.get('collections',{}).get(cname) if not cfile: sys.stderr.write("no file found for collection %s, skipped\n" % cname) continue # read bibcodes from local collection, look them up in global file bibcodes = get_bibcodes_from_json(abs_path) sys.stderr.write("read %d bibcodes from file %s\n" % (len(bibcodes), abs_path)) # sys.stderr.write("first record: %s\n" % str(bibcodes[0])) records = bibcode_lookup(cfile, bibcodes) if not len(records): sys.stderr.write("no records found in file %s\n" % cfile)