Esempio n. 1
0
 def setUp(self):
     """Start an auth-enabled MongoBox sandbox and load test data into it.

     Creates the ``foo`` admin user, authenticates as it, adds a user to
     the test database, overrides the ADSDATA_MONGO_* settings so they
     point at the sandbox, then opens a session and calls ``load_data``.
     """
     # Single source for the sandbox credentials: the user created on the
     # database and the values handed to the config must be the same, or
     # the session cannot authenticate against the auth-enabled sandbox.
     db_name, db_user, db_password = 'test', 'test', 'test'

     self.box = mongobox.MongoBox(scripting=True, auth=True)
     self.box.start()
     self.boxclient = self.box.client()
     self.boxclient['admin'].add_user('foo', 'bar')
     self.boxclient['admin'].authenticate('foo', 'bar')
     self.boxclient[db_name].add_user(db_user, db_password)

     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     config = utils.load_config(os.path.join(base_dir, 'test/adsdata.cfg.test'))
     config['ADSDATA_MONGO_DATABASE'] = db_name
     config['ADSDATA_MONGO_HOST'] = 'localhost'
     # The sandbox listens on its own port, exposed as ``box.port``.
     config['ADSDATA_MONGO_PORT'] = self.box.port
     config['ADSDATA_MONGO_USER'] = db_user
     config['ADSDATA_MONGO_PASSWORD'] = db_password
     self.config = config
     self.session = utils.get_session(config)
     load_data(self.config)
Esempio n. 2
0
 def setUp(self):
     """Start an auth-enabled MongoBox sandbox and load test data into it.

     Creates the ``foo`` admin user, authenticates as it, adds a user to
     the test database, loads ``adsdata.cfg`` and overrides its
     ADSDATA_MONGO_* settings so they point at the sandbox, then opens a
     session and calls ``load_data``.
     """
     # The Mongo user and the config must carry identical credentials;
     # otherwise the session cannot log in to the auth-enabled sandbox.
     db_name = 'test'
     db_user = 'test'
     db_password = 'test'

     self.box = mongobox.MongoBox(scripting=True, auth=True)
     self.box.start()
     self.boxclient = self.box.client()
     self.boxclient['admin'].add_user('foo', 'bar')
     self.boxclient['admin'].authenticate('foo', 'bar')
     self.boxclient[db_name].add_user(db_user, db_password)

     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     config = utils.load_config(os.path.join(base_dir, 'adsdata.cfg'))
     config['ADSDATA_MONGO_DATABASE'] = db_name
     config['ADSDATA_MONGO_HOST'] = 'localhost'
     # The sandbox listens on its own port, exposed as ``box.port``.
     config['ADSDATA_MONGO_PORT'] = self.box.port
     config['ADSDATA_MONGO_USER'] = db_user
     config['ADSDATA_MONGO_PASSWORD'] = db_password
     self.config = config
     self.session = utils.get_session(config)
     load_data(self.config)
Esempio n. 3
0
    def __init__(self, bibcode, ft_source, provider, config=False):
        """Set up the extraction state for a single bibcode.

        ``config`` is used as given when truthy; otherwise the configuration
        is obtained from ``utils.load_config()``.
        """
        self.config = config if config else utils.load_config()

        self.bibcode, self.ft_source, self.provider = bibcode, ft_source, provider

        # Extraction directory: configured root with the bibcode's ptree
        # path appended by plain string concatenation.
        extract_root = self.config['FULLTEXT_EXTRACT_PATH']
        self.extract_dir = extract_root + ptree.id2ptree(bibcode)
        self.meta_path = os.path.join(self.extract_dir, 'meta.json')

        # Nothing has been read from the source yet.
        self.source_loaded = False
        self.source_content = None
        self.dry_run = False

        previous = self.get_last_extracted()
        self.last_extracted = previous
        log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
Esempio n. 4
0
    def __init__(self, bibcode, ft_source, provider, config=False):
        """Record the bibcode, its fulltext source and provider.

        When ``config`` is falsy the configuration comes from
        ``utils.load_config()``; otherwise the supplied object is kept.
        """
        if config:
            self.config = config
        else:
            self.config = utils.load_config()

        self.bibcode = bibcode
        self.ft_source = ft_source
        self.provider = provider

        # The ptree path of the bibcode is appended directly to the
        # configured extract path; meta.json lives inside that directory.
        ptree_root = self.config['FULLTEXT_EXTRACT_PATH']
        ptree_suffix = ptree.id2ptree(bibcode)
        self.extract_dir = ptree_root + ptree_suffix
        self.meta_path = os.path.join(self.extract_dir, 'meta.json')

        self.source_loaded, self.source_content, self.dry_run = False, None, False

        self.last_extracted = self.get_last_extracted()
        log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
Esempio n. 5
0
from java.lang import String
from java.lang import Thread
from java.util import HashMap
from java.util.concurrent import Executors, TimeUnit
from java.util.concurrent import Callable
from java.lang import InterruptedException
from org.apache.pdfbox.pdfparser import PDFParser
from org.apache.pdfbox.pdmodel import PDDocument
from org.apache.pdfbox.util import PDFTextStripper, TextNormalize
from com.rabbitmq.client import ConnectionFactory
from com.rabbitmq.client import Connection
from com.rabbitmq.client import Channel
from com.rabbitmq.client import QueueingConsumer
from com.rabbitmq.client.AMQP import BasicProperties

# Module-level configuration, loaded at import time with no explicit path.
config = utils.load_config()

class PdfExtractor(Callable):

    def __init__(self, channel, opts):
        """Keep the channel and options, and create a consumer on the channel."""
        self.opts = opts
        self.channel = channel
        # The consumer is built on the same channel object stored above.
        self.consumer = QueueingConsumer(self.channel)
    
    def call(self):
        """Register the consumer on the queue and loop over incoming deliveries."""
        log = logging.getLogger()
        # NOTE(review): ``opts`` below is a bare name, not ``self.opts``; it
        # only resolves if a module-level ``opts`` exists at call time --
        # confirm this is intended.
        # The second argument (False) is presumably the client's auto-ack
        # flag -- confirm against the RabbitMQ Java client API.
        self.channel.basicConsume(opts.queue_name, False, self.consumer)
        log.info("Awaiting pdf extraction tasks on %s...", opts.queue_name)
        while True:
            # Fetch the next delivery from the consumer and read its properties.
            delivery = self.consumer.nextDelivery()
            props = delivery.getProperties()
Esempio n. 6
0
    op.set_usage("usage: build_docs.py [options] [%s]" % '|'.join(commands.map.keys()))
    op.add_option('-i', '--infile', dest="infile", action="store")
    op.add_option('-s', '--source_model', dest="source_model", action="store", default="Accno")
    op.add_option('-t','--threads', dest="threads", action="store", type=int, default=cpu_count()) # * 2)
    op.add_option('-l','--limit', dest="limit", action="store", type=int)
    op.add_option('-r','--remove', dest="remove", action="store_true", default=False)
    op.add_option('-d','--debug', dest="debug", action="store_true", default=False)
    op.add_option('-v','--verbose', dest="verbose", action="store_true", default=False)
    op.add_option('--profile', dest='profile', action='store_true',
        help='capture program execution profile', default=False)
    op.add_option('--pygraph', dest='pygraph', action='store_true',
        help='capture exec profile in a call graph image', default=False)
    opts, args = op.parse_args() 
    
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    config = utils.load_config(os.path.join(base_dir, 'adsdata.cfg'))

    log = utils.init_logging(base_dir, opts.verbose, opts.debug)
    if opts.debug:
        log.setLevel(logging.DEBUG)

    try:
        cmd = args.pop()
        assert cmd in commands.map
    except (IndexError,AssertionError):
        op.error("missing or invalid command")
        
    start_cpu = time.clock()
    start_real = time.time()        
    
    if opts.profile:
Esempio n. 7
0
import re
import os
import sys
import time
import json
import ptree
import logging
import itertools
from itertools import imap, islice, ifilter
from optparse import OptionParser
from pymongo import MongoClient
from multiprocessing import Process, JoinableQueue, Manager

from adsdata import utils

# Module-level state created at import time.
config = utils.load_config()  # loaded with no explicit path
commands = utils.commandList()
log = logging.getLogger()  # no name given, so this is the root logger


class Worker(Process):
    def __init__(self, queue, opts, stats):
        Process.__init__(self)
        self.opts = opts
        self.queue = queue
        self.stats = stats

    def run(self):
        while True:
            doc = self.queue.get()
            if doc is None:
Esempio n. 8
0
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

if sys.version_info < (2,7):
    import unittest2 as unittest
else:
    import unittest
    
from adsdata import extractors, utils
# Load the test configuration ('adsdata.cfg.test') from the directory that
# utils.get_script_path reports for this file (presumably the directory
# containing this script -- confirm against utils).
base_dir = utils.get_script_path(file_name_space=__file__)
config_file = os.path.join(base_dir, 'adsdata.cfg.test')
config = utils.load_config(config_file)


class FulltextTestCase(unittest.TestCase):
    """Tests for the fulltext extractor factory."""

    def test_extractor_factory(self):
        """``Extractor.factory`` returns the expected class for each source/provider."""
        # Each entry: (bibcode, fulltext source, provider, expected extractor class)
        test_input = [
            ('2000xxx..999..1234L', 'http://foo/bar/baz', 'Foo', extractors.HttpExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.pdf', 'Foo', extractors.PdfExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.xml', 'Foo', extractors.XMLExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.xml', 'Elsevier', extractors.ElsevierExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.ocr', 'Foo', extractors.PlainTextExtractor),
            ('2000xxx..999..1234L', '/foo/bar/baz.txt', 'Foo', extractors.PlainTextExtractor),
            ('2000xxx..999..1234L', '/foo/bar.html,/foo/baz.html', 'Foo', extractors.HtmlExtractor),
        ]
        for bib, path, prov, cls in test_input:
            ext = extractors.Extractor.factory(bib, path, prov, config)
            # assertIsInstance reports the actual type on failure; the message
            # adds which table entry was being checked.
            self.assertIsInstance(
                ext, cls,
                "source %r (provider %r): expected %s, got %s"
                % (path, prov, cls.__name__, ext.__class__.__name__))
Esempio n. 9
0
    of records found
    """
    records = []
    with open(file) as f:
        for b in biblist:
            sys.stderr.write("searching for %s in file %s\n" % (b, file))
            res = find_key_in_sorted_file(b, f, fold=True)
            for r in res:
                records.append("\t".join([ str(k) for k in r ]))
    return records

# Script entry point: for every file in the demo data directory, look up its
# bibcodes in the file configured for the collection of the same name.
if __name__ == "__main__":

    # 'demo_data' sits next to the directory containing this file; the
    # configuration file lives one directory level further up.
    demo_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'demo_data')
    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    config = utils.load_config(os.path.join(base_dir, 'adsdata.cfg'))

    for f in os.listdir(demo_dir):
        abs_path = os.path.join(demo_dir, f)
        # The collection name is the demo file's name without its extension.
        cname = os.path.splitext(f)[0]
        # Look the collection up under the 'collections' config entry;
        # a missing section or key yields None and the file is skipped.
        cfile = config.get('collections',{}).get(cname)
        if not cfile:
            sys.stderr.write("no file found for collection %s, skipped\n" % cname)
            continue
        # read bibcodes from local collection, look them up in global file
        bibcodes = get_bibcodes_from_json(abs_path)
        sys.stderr.write("read %d bibcodes from file %s\n" % (len(bibcodes), abs_path))
#        sys.stderr.write("first record: %s\n" % str(bibcodes[0]))
        records = bibcode_lookup(cfile, bibcodes)
        # Report on stderr when the lookup produced no records.
        if not len(records):
            sys.stderr.write("no records found in file %s\n" % cfile)