def run(input_directory, articles_filename, citations_filename):
    # Flatten every article graph under input_directory into two CSV files:
    # one row per article and one row per citation ('cites') edge.
    # ('wb' is the file mode the Python 2 csv module expects for output.)
    writer = csv.writer(open(articles_filename, 'wb'))
    citations_writer = csv.writer(open(citations_filename, 'wb'))

    # Progress counters: journals seen (i), files in the current journal (j).
    last_journal, i, j, started = None, 0, 0, time.time()

    for journal, filename, xml in get_graphs(input_directory):
        # When we move on to a new journal, report progress for the one just
        # finished: journal count, file count, elapsed time, seconds per file.
        if last_journal != journal:
            duration = time.time() - started
            if last_journal is not None:
                print "%4d %4d %6.2f %6.4f %s" % (i, j, duration, (duration / j) if j else 0, last_journal)
            last_journal, i = journal, i + 1
            j, started = 0, time.time()
        j += 1

        # Emit one CSV row per article node, normalising whitespace in every field.
        for article in xml.xpath("/article-data/node[@type='article']"):
            fields = dict((n, '') for n in article_field_set)
            for datum in article.xpath('data'):
                if datum.attrib['key'] not in article_field_set:
                    continue
                fields[datum.attrib['key']] = ' '.join((datum.text or '').split()).encode('utf-8')
            fields['id'] = article.attrib['id']
            fields['filename'] = filename
            # Whitespace normalisation now happens above, as each datum is read:
            # fields['title'] = ' '.join(fields['title'].split())
            # fields['abstract'] = ' '.join(fields['abstract'].split())
            writer.writerow(Article(**fields))

        # Emit one CSV row per 'cites' edge: source id, target id, optional count.
        for citation in xml.xpath("/article-data/edge[@type='cites']"):
            try:
                count = citation.xpath("data[@key='count']")[0].text
            except IndexError:
                count = ''
            citations_writer.writerow([citation.attrib['source'], citation.attrib['target'], count])
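
The snippet above assumes a module-level article_field_set listing the data keys to copy, and an Article row type whose fields match; neither appears on this page. A minimal sketch of what they might look like, with illustrative field names only:

import collections

# Hypothetical definitions -- the real project supplies its own field list.
# 'id' and 'filename' must be included, since run() always sets them.
article_field_set = ('id', 'filename', 'title', 'abstract')

# csv.writer.writerow() accepts any sequence, so a namedtuple whose fields
# mirror article_field_set lets Article(**fields) be written directly.
Article = collections.namedtuple('Article', article_field_set)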
def run(input_directory, articles_filename):
    # Build a gzipped tar of JSON record files, one per article, resuming from
    # a previous partial run if a stale lock file is found.
    seen = set()
    parse_lock_filename = os.path.join(os.path.expanduser("~"), ".pubmed", "parse.lock")
    # A leftover lock file means the last run was interrupted; keep its output
    # around as ".old" so already-parsed articles can be skipped.
    if os.path.exists(articles_filename) and os.path.exists(parse_lock_filename):
        os.rename(articles_filename, articles_filename + ".old")

    tar = tarfile.open(articles_filename, "w:gz")

    if os.path.exists(articles_filename + ".old"):
        print "Loading previous progress"
        try:
            old_tar = tarfile.open(articles_filename + ".old", "r:gz")
        except Exception:
            print "Failed to load previous progress"
        else:
            # Copy every member across to the new archive, and remember the
            # source .nxml filename of each .json record so it can be skipped.
            for tar_info in old_tar:
                if tar_info.name.endswith(".json"):
                    seen.add(tar_info.name.replace(".json", ".nxml").rsplit("/", 1)[-1])
                data = old_tar.extractfile(tar_info)
                tar.addfile(tar_info, data)
            old_tar.close()
            os.unlink(articles_filename + ".old")
            print "Done loading previous progress, found %d articles" % len(seen)

    # Touch the lock file to say we've started.
    with open(parse_lock_filename, "w") as f:
        pass

    last_journal, i, j, started = None, 0, 0, time.time()
    url_mapping = get_source_url_mapping()

    for journal, filename, xml in get_graphs(input_directory, filter=seen_before(seen)):
        # Report progress for the journal just finished (skipped on the first
        # journal, since there is nothing to report yet).
        if last_journal != journal:
            duration = time.time() - started
            if last_journal is not None:
                print "%4d %4d %6.2f %6.4f %s" % (i, j, duration, (duration / j) if j else 0, last_journal)
            last_journal, i = journal, i + 1
            j, started = 0, time.time()
        j += 1

        record_list = []
        dataset = {"recordList": record_list}

        # Look up the article's source URL from its PMC identifier, if any.
        pmc = xml.xpath("/article-data/node[1]/data[@key='pmc']")
        source_url = None
        if not pmc:
            print "No PMC found for article (%s)" % filename
        elif pmc[0].text in url_mapping:
            source_url = url_mapping[pmc[0].text]
        else:
            print "Couldn't find source URL for PMC%s (%s)" % (pmc[0].text, filename)

        # Convert each graph node into a record of the appropriate type.
        for node in xml.xpath("/article-data/node"):
            data = Data(node)

            if not data._id:
                print "Missing id:", filename

            if data._type == "article":
                record = article_record(xml, node, data)
            elif data._type == "person":
                record = person_record(xml, node, data)
            elif data._type == "journal":
                record = journal_record(xml, node, data)
            elif data._type in ("organisation", "organization"):  # I can't spell
                record = organisation_record(xml, node, data)
            else:
                print data._type, filename
                print etree.tostring(node)
                continue

            if source_url:
                record["x-source-url"] = source_url
            record_list.append(record)

        # Serialise the record list as JSON and add it to the archive; the
        # TarInfo size must be set before addfile() reads from the buffer.
        tar_info = tarfile.TarInfo("pmc_open_access/%s/%s" % (journal, filename.replace(".nxml", ".json")))
        data = StringIO.StringIO()
        simplejson.dump(dataset, data)
        tar_info.size = data.len
        data.seek(0)
        tar.addfile(tar_info, data)

    # Finalise the archive before removing the lock file; without close() the
    # gzip stream is left truncated.
    tar.close()

    if os.path.exists(parse_lock_filename):
        os.unlink(parse_lock_filename)
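
This variant passes filter=seen_before(seen) to get_graphs() so files already copied from the old archive are skipped; that helper is defined elsewhere in the project. A plausible sketch, assuming get_graphs() calls the filter with each source filename:

def seen_before(seen):
    # Hypothetical helper: build a predicate over filenames for get_graphs().
    # The exact call signature is an assumption; only the name appears above.
    def accept(filename):
        return filename not in seen
    return accept
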
def run(input_directory, articles_filename):
    #
    # Restore old progress:
    #
    seen = set()

    parse_lock_filename = os.path.join(os.path.expanduser('~'), '.pubmed', 'parse.lock')
    # A leftover lock file means the last run was interrupted; keep its output
    # around as '.old' so already-parsed articles can be re-used.
    if os.path.exists(articles_filename) and os.path.exists(parse_lock_filename):
        os.rename(articles_filename, articles_filename+'.old')

    tar = tarfile.open(articles_filename, 'w:gz')

    if os.path.exists(articles_filename+'.old'):
        print "Loading previous progress"
        try:
            old_tar = tarfile.open(articles_filename+'.old', 'r:gz')
        except Exception:
            print "Failed to load previous progress"
        else:
            # Copy every member across to the new archive, and remember the
            # source .nxml filename of each .json record.
            for tar_info in old_tar:
                if tar_info.name.endswith('.json'):
                    seen.add(tar_info.name.replace('.json', '.nxml').rsplit('/', 1)[-1])
                data = old_tar.extractfile(tar_info)
                tar.addfile(tar_info, data)
            old_tar.close()
            os.unlink(articles_filename+'.old')
            print "Done loading previous progress, found %d articles" % len(seen)

    # Touch the lock file to say we've started.
    with open(parse_lock_filename, 'w') as f:
        pass


    # For status reporting.
    last_journal, i, j, started = None, 0, 0, time.time()

    # The URL map is used to recover each article's source URL from its PMC id.
    url_mapping = get_source_url_mapping()

    # Directory filtering is disabled here; the original call passed a filter
    # such as seen_before(seen) to get_graphs():
    # for journal, filename, xml in get_graphs(input_directory, filter=subset):
    for journal, filename, xml in get_graphs(input_directory):
        # Each iteration yields one article file from input_directory, already
        # parsed into an etree object (xml).

        # Report progress for the journal just finished (skipped on the first
        # journal, since there is nothing to report yet).
        if last_journal != journal:
            duration = time.time() - started
            if last_journal is not None:
                print "%4d %4d %6.2f %6.4f %s" % (i, j, duration, (duration / j) if j else 0, last_journal)
            last_journal, i = journal, i + 1
            j, started = 0, time.time()
        j += 1


        record_list = []
        dataset = {
            'recordList' : record_list,
        }


        #
        # Get the PMC ID for the current article and look up its source URL.
        #
        pmc = xml.xpath("/article-data/node[1]/data[@key='pmc']")
        source_url = None
        if not pmc:
            print "No PMC found for article (%s)" % filename
        elif pmc[0].text in url_mapping:
            source_url = url_mapping[pmc[0].text]
        else:
            print "Couldn't find source URL for PMC%s (%s)" % (pmc[0].text, filename)

        # Convert each graph node into a record of the appropriate type.
        for node in xml.xpath("/article-data/node"):
            data = Data(node)

            if not data._id:
                print "Missing id:", filename

            if data._type == 'article':
                record = article_record(xml, node, data)
            elif data._type == 'person':
                record = person_record(xml, node, data)
            elif data._type == 'journal':
                record = journal_record(xml, node, data)
            elif data._type in ('organisation', 'organization'): # I can't spell
                record = organisation_record(xml, node, data)
            else:
                print data._type, filename
                print etree.tostring(node)
                continue

            if source_url:
                record['x-source-url'] = source_url
            record_list.append(record)


        # Serialise the record list as JSON and add it to the archive; the
        # TarInfo size must be set before addfile() reads from the buffer.
        tar_info = tarfile.TarInfo('pmc_open_access/%s/%s' % (journal, filename.replace('.nxml', '.json')))
        data = StringIO.StringIO()

        # simplejson.dump raised errors with indent='  '; an integer indent
        # works fine.
        simplejson.dump(dataset, data, indent=2)
        tar_info.size = data.len
        data.seek(0)
        tar.addfile(tar_info, data)

    # Finalise the archive before removing the lock file; without close() the
    # gzip stream is left truncated.
    tar.close()

    if os.path.exists(parse_lock_filename):
        os.unlink(parse_lock_filename)
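
Both tar-building variants rely on a Data wrapper (plus article_record, person_record, journal_record and organisation_record helpers) defined elsewhere in the project. A rough sketch of the role Data plays here; everything beyond the _id and _type attributes visible above is an assumption:

class Data(object):
    # Hypothetical wrapper for a graph <node>: exposes its id/type attributes
    # and collects its <data key="..."> children into a dict for the
    # *_record() helpers to draw on.
    def __init__(self, node):
        self._id = node.attrib.get('id')
        self._type = node.attrib.get('type')
        self.values = dict((datum.attrib['key'], datum.text or '')
                           for datum in node.xpath('data'))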