Example #1
0
def arxiv_fft_get(obj, eng):
    """Get FFT from arXiv, if arXiv ID is provided.

    Downloads the PDF for the deposition's arXiv ID into a shared
    temporary file and attaches it to the latest (unsealed) SIP.

    :param obj: workflow object holding the deposition.
    :param eng: workflow engine (unused here; required by the workflow API).
    """
    deposition = Deposition(obj)
    sip = deposition.get_latest_sip(sealed=False)
    metadata = sip.metadata

    if metadata.get('arxiv_id'):
        arxiv_pdf_url = cfg.get("ARXIV_PDF_URL", "http://arxiv.org/pdf/") + \
            "{0}.{1}"

        # Old-style arXiv IDs (e.g. "hep-th/9901001") contain a slash,
        # which is not valid inside a filename.
        safe_arxiv_id = metadata['arxiv_id'].replace("/", "_")

        from invenio.config import CFG_TMPSHAREDDIR
        arxiv_file, arxiv_file_path = mkstemp(
            prefix="%s_" % safe_arxiv_id,
            suffix='.pdf',
            dir=CFG_TMPSHAREDDIR,
        )
        # mkstemp returns an open OS-level descriptor; close it so the
        # handle is not leaked (download_url writes via the path).
        os.close(arxiv_file)

        download_url(url=arxiv_pdf_url.format(metadata['arxiv_id'], "pdf"),
                     content_type="pdf",
                     download_to_file=arxiv_file_path)

        # To get 1111.2222.pdf as filename.
        filename = "{0}.pdf".format(safe_arxiv_id)

        try:
            save_deposition_file(deposition,
                                 filename,
                                 arxiv_file_path)
        except FilenameAlreadyExists:
            obj.log.error("PDF file not saved: filename already exists.")
        except Exception as e:
            # `e.message` was removed in Python 3; formatting the exception
            # itself is the portable spelling.
            obj.log.error("PDF file not saved: {}.".format(e))
    def test_content_type(self):
        """Test simple calls to download_url."""
        from invenio_utils.filedownload import (download_url,
                                                InvenioFileDownloadError)

        # Fetching an HTML page as HTML should yield a usable temp file.
        fetched = download_url("http://duckduckgo.com", content_type="html")
        self.assertTrue(fetched)

        # Asking for the same kind of page as a PDF must raise.
        def fetch_as_pdf():
            return download_url("http://google.com", content_type="pdf")

        self.assertRaises(InvenioFileDownloadError, fetch_as_pdf)
Example #3
0
def output_keywords_for_sources(
        input_sources, taxonomy_name, output_mode="text",
        output_limit=None, spires=False,
        match_mode="full", no_cache=False, with_author_keywords=False,
        rebuild_cache=False, only_core_tags=False, extract_acronyms=False,
        **kwargs):
    """Output the keywords for each source in sources.

    :param input_sources: iterable of directories, files or URLs to process.
    :param taxonomy_name: name of the taxonomy used for keyword extraction.
    :param output_mode: "text" prints the source name; passed through to
        the extractor for output formatting.
    :param output_limit: maximum number of keywords; defaults to
        ``cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']`` resolved at call time
        (a config lookup in the signature would be frozen at import time).
    """
    if output_limit is None:
        output_limit = cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']

    # Inner function which does the job and it would be too much work to
    # refactor the call (and it must be outside the loop, before it did
    # not process multiple files)
    def process_lines():
        if output_mode == "text":
            print("Input file: %s" % source)

        line_nb = len(text_lines)
        # Raw string avoids the invalid-escape warning for "\S".
        word_nb = sum(len(re.findall(r"\S+", line)) for line in text_lines)

        # NOTE: the original mixed %-style placeholders with str.format,
        # which never substituted the counts. Lazy logger arguments fix
        # that and defer formatting until the record is emitted.
        current_app.logger.info(
            "Remote file has %d lines and %d words.", line_nb, word_nb
        )
        return get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms
        )

    # Get the fulltext for each source.
    for entry in input_sources:
        current_app.logger.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                # Skip hidden files such as .DS_Store.
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines, dummy = get_plaintext_document_body(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines, dummy = get_plaintext_document_body(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            local_file = download_url(entry)
            text_lines, dummy = get_plaintext_document_body(local_file)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
Example #4
0
def output_keywords_for_sources(input_sources,
                                taxonomy_name,
                                output_mode="text",
                                output_limit=None,
                                spires=False,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                rebuild_cache=False,
                                only_core_tags=False,
                                extract_acronyms=False,
                                **kwargs):
    """Output the keywords for each source in sources.

    :param input_sources: iterable of directories, files or URLs to process.
    :param taxonomy_name: name of the taxonomy used for keyword extraction.
    :param output_mode: "text" prints the source name; passed through to
        the extractor for output formatting.
    :param output_limit: maximum number of keywords; defaults to
        ``cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']`` resolved at call time.
    """
    if output_limit is None:
        output_limit = cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']

    # Inner function which does the job and it would be too much work to
    # refactor the call (and it must be outside the loop, before it did
    # not process multiple files)
    def process_lines():
        if output_mode == "text":
            print("Input file: %s" % source)

        line_nb = len(text_lines)
        # Raw string avoids the invalid-escape warning for "\S".
        word_nb = sum(len(re.findall(r"\S+", line)) for line in text_lines)

        # NOTE: the original mixed %-style placeholders with str.format,
        # which never substituted the counts; lazy logger args fix that.
        current_app.logger.info(
            "Remote file has %d lines and %d words.", line_nb, word_nb)
        return get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms)

    # Get the fulltext for each source.
    for entry in input_sources:
        current_app.logger.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                # Skip hidden files such as .DS_Store.
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines, dummy = get_plaintext_document_body(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines, dummy = get_plaintext_document_body(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            from invenio_utils.filedownload import download_url
            local_file = download_url(entry)
            text_lines, dummy = get_plaintext_document_body(local_file)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()