Example #1
def generate_keywords(req, recid, store_keywords=True):
    from invenio.bibclassify_engine import get_keywords_from_local_file

    req.write(
        "Please be patient while the keywords classification is running...")

    bibdocfiles = BibRecDocs(recid).list_latest_files()

    keywords = []
    for doc in bibdocfiles:
        # Get the keywords for each PDF document contained in the record.
        fulltext = doc.get_full_path()
        if is_pdf(fulltext):
            single_keywords, composite_keywords = get_keywords_from_local_file(
                fulltext, taxonomy='HEP', with_author_keywords=True)

            for keyword, spans in single_keywords.items():
                keywords.append([keyword.concept, len(spans)])
            for keyword, num, components in composite_keywords:
                keywords.append([keyword.concept, num])

    if keywords and store_keywords:
        output = [
            '<collection><record>\n'
            '<controlfield tag="001">%s</controlfield>' % recid
        ]

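        # NOTE: only the keywords of the last PDF processed above are
        # serialized by output_marc(); 'keywords' itself accumulates the
        # results from every document in the record.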
        output.append(
            output_marc(single_keywords,
                        composite_keywords,
                        spires=False,
                        taxonomy='HEP'))

        output.append('</record></collection>')

        tmp_directory = "%s/bibclassify" % CFG_TMPDIR
        filename = "bibclassifyd_%s.xml" % time.strftime(
            "%Y%m%d%H%M%S", time.localtime())
        abs_path = os.path.join(tmp_directory, filename)

        if not os.path.isdir(tmp_directory):
            os.mkdir(tmp_directory)

        with open(abs_path, "w") as file_desc:
            file_desc.write('\n'.join(output))

        #cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, abs_path)
        #os.system(cmd)

    return keywords
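
A minimal, hypothetical driver for the handler above: in Invenio, "req" is the mod_python request object, but generate_keywords() only calls req.write(), so a small stand-in with a write() method is enough for a dry run. The record ID 1234 is a placeholder.

class _FakeRequest(object):
    """Stand-in for the mod_python request; only write() is used above."""
    def write(self, text):
        print(text)

# Hypothetical record ID; run the classification without storing MARCXML.
extracted = generate_keywords(_FakeRequest(), recid=1234, store_keywords=False)
for concept, count in extracted:
    print("%s (%d)" % (concept, count))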
Example #2
def _analyze_documents(records, ontology, collection):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding ontology."""
    global _INDEX

    if not records:
        # No records could be found.
        write_message("WARNING: No records were found in collection %s." %
                      collection,
                      stream=sys.stderr,
                      verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        bibdocfiles = BibRecDocs(record).list_latest_files()
        output.append('<record>')
        output.append('<controlfield tag="001">%s</controlfield>' % record)
        for doc in bibdocfiles:
            # Get the keywords for each PDF document contained in the record.
            if is_pdf(doc.get_full_path()):
                write_message('INFO: Generating keywords for record %d.' %
                              record,
                              stream=sys.stderr,
                              verbose=3)
                fulltext = doc.get_full_path()

                output.append(
                    output_keywords_for_local_file(
                        fulltext,
                        taxonomy=ontology,
                        output_mode="marcxml",
                        output_limit=3,
                        match_mode="partial",
                        with_author_keywords=True,
                        verbose=task_get_option('verbose')))

        _INDEX += 1

        output.append('</record>')

        task_update_progress('Done %d out of %d.' % (_INDEX, _RECIDS_NUMBER))
        task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
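
A hedged usage sketch for the helper above. The record IDs, taxonomy, and collection name are placeholders, and the module-level counters the function updates (_INDEX and _RECIDS_NUMBER) are normally defined by the surrounding daemon, so a standalone call has to provide them.

_INDEX = 0
_RECIDS_NUMBER = 3  # only used for the progress message

marcxml = _analyze_documents([77, 78, 79], 'HEP', 'Articles')
if marcxml:
    # Wrap the per-record fragments in a collection, as Example #1 does.
    print('<collection>\n%s\n</collection>' % marcxml)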
Example #3
def _analyze_documents(
        records,
        taxonomy_name,
        collection,
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message(
            "WARNING: No records were found in collection %s." % collection,
            stream=sys.stderr,
            verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        # TODO: why doesn't this call list_all_files()?
        bibdocfiles = BibRecDocs(record).list_latest_files()
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message(
                    'INFO: Generating keywords for record %d.' % record,
                    stream=sys.stderr,
                    verbose=3)
                fulltext = doc.get_full_path()

                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(
                        fulltext, taxonomy_name, with_author_keywords=True,
                        output_mode="raw", output_limit=output_limit,
                        match_mode='partial')
            else:
                bibtask.write_message(
                    'WARNING: BibClassify does not know how to process '
                    'doc: %s (type: %s) -- ignoring it.' %
                    (doc.fullpath, doc.doctype),
                    stream=sys.stderr,
                    verbose=3)

            if single_keywords or composite_keywords:
                cleaned_single = bibclassify_engine.clean_before_output(
                    single_keywords)
                cleaned_composite = bibclassify_engine.clean_before_output(
                    composite_keywords)
                # merge the groups into one
                keywords.update(cleaned_single)
                keywords.update(cleaned_composite)
            # Guard the sentinel None values initialized above: acronyms and
            # author_keywords stay None until a PDF has been processed.
            if acronyms:
                acro.update(acronyms)
            if author_keywords:
                akws.update(author_keywords)

        if keywords:
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(
                bibclassify_engine._output_marc(
                    keywords.items(), (),
                    akws,
                    acro,
                    spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.' %
                                  record,
                                  stream=sys.stderr,
                                  verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' %
                                     (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
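
The same pattern applies to this richer variant. Everything below is illustrative (the record IDs, taxonomy, and output location are assumptions) and mirrors the temporary-file step that Example #1 performs before its commented-out bibupload call.

import os
import time

from invenio.config import CFG_TMPDIR  # same config value Example #1 uses

_INDEX = 0
_RECIDS_NUMBER = 2

marcxml = _analyze_documents([101, 102], 'HEP', 'Articles', output_limit=5)
if marcxml:
    filename = "bibclassifyd_%s.xml" % time.strftime("%Y%m%d%H%M%S",
                                                     time.localtime())
    with open(os.path.join(CFG_TMPDIR, filename), "w") as fdesc:
        fdesc.write('<collection>\n%s\n</collection>' % marcxml)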