Esempio n. 1
0
File: test.py Project: gsf/ptree
 def test_id2ptree(self):
     """Table-driven check of id2ptree().

     Each case in self.i2ptree_tests is (input_id, expected_path,
     description), with an optional fourth element giving a custom
     separator to pass as sep=.
     """
     for case in self.i2ptree_tests:
         if len(case) == 4:  # uses custom separator
             result = id2ptree(case[0], sep=case[3])
         else:
             # previously only len == 3 was handled, leaving `result`
             # unbound (or stale from the prior case) for other lengths
             result = id2ptree(case[0])
         msg = "%s: id2ptree(%s) = %s but got %s" % \
                 (case[2], case[0], case[1], result)
         self.assertEqual(result, case[1], msg=msg)
Esempio n. 2
0
def download():
    count = 0
    for item in get_items():
        count += 1

        id = item.find('./{http://tempuri.org/}MapaDigital').text
        if not id or id == "0":
            continue

        # write out metadata
        data_dir = 'data' + ptree.id2ptree(id)
        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        et.cleanup_namespaces(item)
        xml = et.tostring(item, pretty_print=True)

	# no need to refetch
        metadata_file = os.path.join(data_dir, "%s.xml" % id)
        if os.path.isfile(metadata_file):
            continue
        open(metadata_file, "w").write(xml)

        # be nice :)
        time.sleep(1)

        # try to download shapefile
        zip_url = item.find('./{http://tempuri.org/}UrlZip').text
        if zip_url:
            r = requests.get(zip_url, headers={"User-Agent": UA})
            if r.headers['Content-Type'] == 'application/x-zip-compressed':
                zip_file = metadata_file.replace(".xml", ".zip")
                open(zip_file, "wb").write(r.content)
        	print "%s %s %s %s" % (datetime.datetime.now(), count, id, zip_file)
Esempio n. 3
0
def fetch(url):
    """
    GETs a url, extracts RDFa from it, and persists it to disk. fetch()
    will return the name of the file where the metadata was stored, or
    None if the RDF was already fetched.
    """
    dirname = "store" + ptree.id2ptree(url)
    path = os.path.join(dirname, "metadata.nt")

    # if it's there already don't bother getting it again
    if os.path.isfile(path):
        logging.info("already harvested %s as %s" % (url, path))
        return None

    # create the directory if necessary
    if not os.path.isdir(dirname):
        os.makedirs(dirname)

    # extract rdfa and save it
    try:
        graph = rdflib.Graph()
        html = urllib.urlopen(url).read()
        graph.parse(data=html, format="rdfa")
        triples = len(graph)
        # with-statement closes the handle even if serialization fails
        with open(path, "w") as fh:
            graph.serialize(fh, format="nt")
        logging.info("saved %s as %i triples in %s" % (url, triples, path))
    except Exception:
        # narrowed from a bare except so Ctrl-C / SystemExit still work;
        # best-effort: log the failure and fall through. NOTE(review):
        # path is still returned even when nothing was written.
        logging.exception("unable to extract rdfa from %s" % url)

    # be nice
    time.sleep(2)
    return path
Esempio n. 4
0
def harvest(base_url, metadata_prefix='oai_dc', set_name=None):
    url = "%s?verb=ListRecords&metadataPrefix=%s" % (base_url, metadata_prefix)
    if set_name:
        url += "&set=%s" % set_name

    while True:
        doc = etree.parse(url)
        for record in doc.xpath('oai:ListRecords/oai:record', namespaces=ns):
            # determine the record identifier
            id = record.xpath('string(oai:header/oai:identifier)', namespaces=ns)

            # write out the record to a pair tree
            d = "data" + id2ptree(id)

            if not os.path.isdir(d):
                os.makedirs(d)

            p = os.path.join(d, "%s-%s.xml" % (id, metadata_prefix))
            open(p, "w").write(etree.tostring(record))

            print "saved %s as %s" % (id, p)

        # handle resumption token
        t = doc.xpath('string(oai:ListRecords/oai:resumptionToken)', namespaces=ns)
        if not t:
            break
        url = "%s?verb=ListRecords&resumptionToken=%s" % (base_url, t)
Esempio n. 5
0
def load_fulltext(bibcode, field_name):
    """Read an extracted fulltext file for a bibcode from the pairtree.

    :param bibcode: article bibcode used to derive the pairtree path
    :param field_name: basename of the .txt file to read (e.g. 'body')
    :return: file contents decoded as UTF-8, or u"" if the file is absent
    """
    ptree_path = ptree.id2ptree(bibcode)
    # TODO: make this path a config setting again
    full_path = '/proj/ads/fulltext/extracted%s%s.txt' % (ptree_path, field_name)
    if not os.path.exists(full_path):
        return u""
    # with-statement guarantees the handle is closed even if read() raises
    with open(full_path, 'r') as fo:
        text = fo.read()
    return text.decode('utf-8')
Esempio n. 6
0
def display_image(bibcode,figure_id,image_format):
    """
    For a given article, figure ID and format, fetch and display the image
    """
    # map short format codes to file extensions; unknown formats fall back to png
    format2ext = {'tb':'gif','lr':'jpg','hr':'png'}
    image_ext = format2ext.get(image_format,'png')
    # pairtree directory under the configured image root
    image_dir = config.IMAGE_PATH + ptree.id2ptree(bibcode)
    image = "%s%s_%s_%s.%s" % (image_dir,bibcode,figure_id,image_format,image_ext)
    try:
        image_data = open(image, "rb").read()
    except Exception, e:
        app.logger.error('ID %s. Unable to get image %s (format: %s) for bibcode : %s! (%s)' % (g.user_cookie_id,figure_id,image_format,bibcode,e))
        # 204 No Content when the image file can't be read
        return ('', 204)
    # NOTE(review): no success-path return is visible here -- the function
    # appears truncated in this view; image_data is unused as shown.
Esempio n. 7
0
 def __init__(self, bibcode, ft_source, provider):
     """Record the article's identifiers and derive the pairtree paths
     used for fulltext extraction.

     :param bibcode: article bibcode
     :param ft_source: location of the fulltext source file
     :param provider: name of the fulltext provider
     """
     self.bibcode = bibcode
     self.ft_source = ft_source
     self.provider = provider
     # pairtree directory under the configured extract root
     self.extract_dir = config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(bibcode)
     self.meta_path = os.path.join(self.extract_dir, 'meta.json')
     # lazy-loading state for the source content
     self.source_loaded = False
     self.source_content = None
     self.dry_run = False
     
     self.last_extracted = self.get_last_extracted()
     log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
Esempio n. 8
0
def create_meta_path(dict_input, extract_path):
    """
    Converts the BibCode of the file into a pair tree path name. For example,
    2015TEST would be converted into '20/15/TE/ST/'.

    :param dict_input: meta-data content of the article given
    :param extract_path: root directory to extract the full text content to
    :return: full path to the article's meta.json within the pair tree
    """

    ptr = ptree.id2ptree(dict_input['bibcode'])
    # id2ptree returns a '/'-wrapped path, so plain concatenation yields
    # <extract_path><pair tree>meta.json
    extract_path = extract_path + ptr + 'meta.json'
    logger.debug('extract_path: {0}'.format(extract_path))

    return extract_path
Esempio n. 9
0
def create_meta_path(dict_input, extract_key='FULLTEXT_EXTRACT_PATH'):
    """
    Converts the BibCode of the file into a pair tree path name. For example,
    2015TEST would be converted into '20/15/TE/ST/'.

    :param dict_input: meta-data content of the article given
    :param extract_key: config key naming the root extraction directory
    :return: full path to the article's meta.json within the pair tree
    """

    # id2ptree returns a '/'-wrapped path, so simple concatenation works
    pair_tree = ptree.id2ptree(dict_input[CONSTANTS['BIBCODE']])
    meta_path = ''.join([config[extract_key], pair_tree, 'meta.json'])
    logger.debug('extract_path: {0}'.format(meta_path))

    return meta_path
Esempio n. 10
0
def create_meta_path(dict_input, extract_path):
    """
    Converts the BibCode of the file into a pair tree path name. For example,
    2015TEST would be converted into '20/15/TE/ST/'.

    :param dict_input: meta-data content of the article given
    :param extract_path: root directory to extract the full text content to
    :return: full path to the article's meta.json within the pair tree
    """

    ptr = ptree.id2ptree(dict_input['bibcode'])
    # id2ptree returns a '/'-wrapped path, so plain concatenation yields
    # <extract_path><pair tree>meta.json
    extract_path = extract_path + ptr + 'meta.json'
    logger.debug('extract_path: {0}'.format(extract_path))

    return extract_path
Esempio n. 11
0
    def run(self):
        """Worker loop: consume docs from the queue and write a meta.json
        (ft_source/provider/index_date) into each doc's pairtree extract
        dir, then backdate the file's mtime to the doc's _generated time.

        A None item on the queue is the shutdown sentinel.
        """
        while True:
            doc = self.queue.get()
            if doc is None:
                log.debug("Nothing left to do for worker %s", self.name)
                self.queue.task_done()
                break

            self.stats['processed'] += 1
            log.info("Worker %s is working on %s", self.name, doc['bibcode'])

            extract_dir = config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(
                doc['bibcode'])
            meta_path = os.path.join(extract_dir, 'meta.json')
            log.debug("meta path: %s", meta_path)

            # dry-run testing
            #            self.queue.task_done()
            #            continue

            # skip docs whose extract dir was never created
            if not os.path.exists(extract_dir):
                log.debug("no existing extract dir for %s", doc['bibcode'])
                self.stats['missing'] += 1
                self.queue.task_done()
                continue

            # don't overwrite an existing meta.json unless --force was given
            if os.path.exists(meta_path) and not self.opts.force:
                log.debug("found existing meta file for %s", doc['bibcode'])
                self.queue.task_done()
                continue

            meta = {
                'ft_source': doc['ft_source'],
                'provider': doc['ft_provider'],
                'index_date': doc['index_date']
            }

            log.debug("writing meta file for %s", doc['bibcode'])
            with open(meta_path, 'w') as f:
                json.dump(meta, f)

            # backdate the mtime to when the source record was generated
            mtime = time.mktime(doc['_generated'].timetuple())
            log.debug("setting mtime for %s to %s, %s", meta_path,
                      doc['_generated'], mtime)
            os.utime(meta_path, (mtime, mtime))

            self.queue.task_done()
Esempio n. 12
0
    def run(self):
        """Worker loop: consume docs from the queue and write a meta.json
        (ft_source/provider/index_date) into each doc's pairtree extract
        dir, then backdate the file's mtime to the doc's _generated time.

        A None item on the queue is the shutdown sentinel.
        """
        while True:
            doc = self.queue.get()
            if doc is None:
                log.debug("Nothing left to do for worker %s", self.name)
                self.queue.task_done()
                break

            self.stats['processed'] += 1
            log.info("Worker %s is working on %s", self.name, doc['bibcode'])
            
            extract_dir = config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(doc['bibcode'])
            meta_path = os.path.join(extract_dir, 'meta.json')
            log.debug("meta path: %s", meta_path)
            
            # dry-run testing
#            self.queue.task_done()
#            continue
         
            # skip docs whose extract dir was never created
            if not os.path.exists(extract_dir):
                log.debug("no existing extract dir for %s", doc['bibcode'])
                self.stats['missing'] += 1
                self.queue.task_done()
                continue
            
            # don't overwrite an existing meta.json unless --force was given
            if os.path.exists(meta_path) and not self.opts.force:
                log.debug("found existing meta file for %s", doc['bibcode'])
                self.queue.task_done()
                continue
            
            meta = {
                'ft_source': doc['ft_source'],
                'provider': doc['ft_provider'],
                'index_date': doc['index_date']
                }
            
            log.debug("writing meta file for %s", doc['bibcode'])
            with open(meta_path,'w') as f:
                json.dump(meta, f)
            
            # backdate the mtime to when the source record was generated
            mtime = time.mktime(doc['_generated'].timetuple())
            log.debug("setting mtime for %s to %s, %s", meta_path, doc['_generated'], mtime)
            os.utime(meta_path, (mtime, mtime))
            
            self.queue.task_done()
Esempio n. 13
0
    def __init__(self, bibcode, ft_source, provider, config=False):
        """Set up extraction state for one article.

        :param bibcode: article bibcode
        :param ft_source: location of the fulltext source file
        :param provider: name of the fulltext provider
        :param config: optional pre-loaded config mapping; any falsy value
            (the default False, None, or an empty mapping) makes
            utils.load_config() be used instead.
        """
        # NOTE(review): `if not config` treats an explicitly passed empty
        # mapping the same as "no config given" -- confirm that is intended.
        if not config:
            self.config = utils.load_config()
        else:
            self.config = config

        self.bibcode = bibcode
        self.ft_source = ft_source
        self.provider = provider
        # pairtree directory under the configured extract root
        self.extract_dir = self.config[
            'FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(bibcode)
        self.meta_path = os.path.join(self.extract_dir, 'meta.json')
        # lazy-loading state for the source content
        self.source_loaded = False
        self.source_content = None
        self.dry_run = False

        self.last_extracted = self.get_last_extracted()
        log.debug("%s last extracted: %s", self.bibcode, self.last_extracted)
Esempio n. 14
0
def workout_field_value(message):
    """Handle a 'PythonTextField' request: parse the k:v pairs packed into
    the message's 'externalVal' param and resolve the article body either
    from mongo or from one of the pairtree source directories on disk.

    The result is delivered via message.setResults(); returns nothing.
    """
    sender = str(message.getSender())
    # NOTE(review): `in` is a substring test here, so e.g. 'Python' or ''
    # would match 'PythonTextField' -- confirm '==' was not intended.
    if sender in 'PythonTextField':
        value = message.getParam('externalVal')
        if not value:
            return
        value = str(value)
        message.threadInfo('searching for ' + value)
        vals = {}
        ret = None
        if value:
            # externalVal is pipe-delimited k:v pairs; values may be
            # wrapped in [brackets], which are stripped
            parts = value.split('|')
            for p in parts:
                k, v = p.split(':', 1)
                if v[0] == '[' and v[-1] == ']':
                    v = v[1:-1]
                vals[k] = v
            if 'bibcode' in vals and 'src_dir' in vals:
                if vals['src_dir'] == "mongo":
                    # fetch the body straight from mongo
                    mongo = pymongo.Connection('adszee')
                    docs = mongo['solr4ads']['docs']
                    bib = vals['bibcode']
                    doc = docs.find_one({'bibcode': bib}, {'body': 1})
                    if doc:
                        message.setResults(doc['body'])
                        return
                else:
                    # otherwise try each source dir's pairtree in turn,
                    # returning the first body.txt found
                    dirs = vals['src_dir'].split(',')
                    bib = vals['bibcode'].split(',')[0].strip()
                    ptree_path = ptree.id2ptree(bib)
    
                    for d in dirs:
                        full_path = d + ptree_path + 'body.txt'
                        message.threadInfo('looking for ' + full_path)
                        if os.path.exists(full_path):
                            fo = open(full_path, 'r')
                            ret = fo.read()
                            message.setResults(ret.decode('utf-8'))
                            return
Esempio n. 15
0
    def get_path_to_htid(self, htid):
        """ Returns the path to the pairtree directory for this htid.

            Args should include the id namespace, eg:
                dul1.ark:/13960/t00z7x54f
                uc2.ark:/13960/t9p26rn3h
                etc.

            Returns a tuple - (path, postfix)

            Raises ValueError if the id's directory is not found under
            the collection path.
        """

        # split on the first '.' only: ids may themselves contain dots
        # (e.g. miun.adx6300.0001.001), which made a bare split('.')
        # raise "too many values to unpack"
        ns, post = htid.split('.', 1)
        posttree = ptree.id2ptree(post)
        posttree = posttree.strip('/') # / at front of string break path join
        post = self.encode(post) # replace :, /, etc.

        l = [self.cpath, ns, 'pairtree_root', posttree, post]
        fullpath = os.path.join(*l)

        if not os.path.exists(fullpath):
            raise ValueError("Is id {} in the collection? Path {} not found."
                    .format(htid, fullpath))
        return fullpath, post
Esempio n. 16
0
File: ms.py Project: gsf/ms
def get_response():
    """CGI entry point: dispatch on REQUEST_METHOD.

    GET  ?id=...&f=...    -> plain-text response pointing at the file's
                             URL in the pairtree store
    POST id + file        -> save the uploaded file into the pairtree
    DELETE id + filename  -> remove the file (and empty parent dirs)
    otherwise             -> upload form (html) or a plain-text banner

    Returns the full CGI response (headers + body) as a string.
    """
    accept_formats = os.environ.get('HTTP_ACCEPT', [])
    if accept_formats:
        accept_formats = accept_formats.split(',')
    form = cgi.FieldStorage()
    #print 'Content-Type: text/plain\n'
    #print form
    #print 'Content-Type: text/html\n'
    #cgi.print_environ()

    if os.environ['REQUEST_METHOD'] == 'GET':
        identity = None
        filename = 'meta.json'
        if 'id' in form:
            identity = form['id'].value
        if 'f' in form:
            filename = form['f'].value
        if identity:
            # pairtree path for the id, served from the data host
            ppath = ptree.id2ptree(identity)
            name = os.path.basename(filename)
            url = 'http://data.free103point9.org/r%s%s' % (ppath, name)
            return '''Content-Type: text/plain
Location: %(url)s

%(url)s''' % {'url': url}
            
    if os.environ['REQUEST_METHOD'] == 'POST':
        if 'id' in form and 'file' in form:
            identity = form['id'].value
            # A nested FieldStorage instance holds the file
            fileitem = form['file']

            if identity and fileitem.filename:
                ppath = ptree.id2ptree(identity)
                home = '../r%s' % ppath
                try:
                    os.makedirs(home)
                except OSError:
                    # directory already exists
                    pass
                # strip leading path from file name to avoid directory traversal attacks
                name = os.path.basename(fileitem.filename)
                f = open('../r%s%s' % (ppath, name), 'wb', 10000)

                # Read the file in chunks
                for chunk in fbuffer(fileitem.file):
                   f.write(chunk)
                f.close()
                log(datetime.now().isoformat() + ' POST ' + identity + ' ' + name)
                message = "The file %s was uploaded successfully" % name
                if 'text/html' in accept_formats:
                    return html_response('<p>%s</p>' % message)
                else:
                    return 'Content-Type: text/plain\n\n%s' % message

    if os.environ['REQUEST_METHOD'] == 'DELETE':
        if 'id' in form and 'filename' in form:
            identity = form['id'].value
            filename = form['filename'].value
            if identity and filename:
                ppath = ptree.id2ptree(identity)
                dir = '../r%s' % ppath
                name = os.path.basename(filename)
                os.remove(dir + name)
                try:
                    os.removedirs(dir) # remove parent directories if empty
                except OSError:
                    # parent dirs not empty; leave them in place
                    pass
                log(datetime.now().isoformat() + ' DELETE ' + identity + ' ' + name)
                message = "The file %s was deleted successfully" % name
                if 'text/html' in accept_formats:
                    return html_response('<p></p>')
                else:
                    return 'Content-Type: text/plain\n\n%s' % message
                

    # fall through: unknown method or incomplete request parameters
    if 'text/html' in accept_formats:
        return html_response("""
<form enctype="multipart/form-data" method="post">
  <input type="text" name="id">
  <input type="file" name="file">
  <input type="submit">
</form>""")
    else:
        return 'Content-Type: text/plain\n\ndata.free103point9.org'
Esempio n. 17
0
#!/usr/bin/env python
# Snapshot metadata for every item in an Internet Archive collection
# into a local pairtree, and report the collection's total size.

import os
import json
import ptree

from internetarchive import search_items, Item

# running total of the collection's size in bytes
total_bytes = 0

for result in search_items('collection:usda-nurseryandseedcatalog'):
    id = result['identifier']
    item = Item(id)
    metadata = item.get_metadata()
    # lstrip the leading '/' so os.path.join keeps the 'items' prefix
    item_dir = os.path.join('items', ptree.id2ptree(id).lstrip("/"))
    if not os.path.isdir(item_dir):
        os.makedirs(item_dir)
    with open(os.path.join(item_dir, 'metadata.json'), 'w') as fh:
        fh.write(json.dumps(metadata, indent=2))

    # accumulate the size of every file attached to the item
    total_bytes += sum([f.size for f in item.iter_files()])
    print item_dir

print total_bytes
Esempio n. 18
0
Check for the existence of the fulltext body and meta.json files

AA 11/1/16
"""

import json
import ptree
import fileinput
import os
from settings import config

if __name__ == '__main__':

    # each input line is: bibcode fulltext-source-path provider
    for line in fileinput.input():
        bibcode, fname, provider = line.strip().split()
        # pairtree extract directory for this bibcode
        f = config['FULLTEXT_EXTRACT_PATH'] + ptree.id2ptree(bibcode)
        if not os.path.exists(f):
            print "{0}: missing_dir   {1}".format(bibcode, f)
        meta = f + 'meta.json'
        full = f + 'fulltext.txt'
        if not os.path.exists(meta):
            # without meta.json there is nothing more to check
            print "{0} : missing_meta {1}".format(bibcode, meta)
            continue
        if not os.path.exists(full):
            print "{0} : missing_ft   {1}".format(bibcode, full)
        try:
            # meta.json must carry an index_date
            d = json.load(open(meta))
            ts = d['index_date']
        except KeyError:
            print "{0}: missing_date  {1}".format(bibcode, meta)