def output_keywords_for_sources(input_sources, taxonomy_name,
                                output_mode="text",
                                output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
                                spires=False,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                rebuild_cache=False,
                                only_core_tags=False,
                                extract_acronyms=False,
                                **kwargs):
    """Outputs the keywords for each source in sources."""
    # Inner function that does the actual work. Refactoring the call
    # site would be too much work, and it must be defined outside the
    # loop: previously, multiple files were not processed.
    def process_lines():
        if output_mode == "text":
            print "Input file: %s" % source

        output = get_keywords_from_text(text_lines,
                                        taxonomy_name,
                                        output_mode=output_mode,
                                        output_limit=output_limit,
                                        spires=spires,
                                        match_mode=match_mode,
                                        no_cache=no_cache,
                                        with_author_keywords=with_author_keywords,
                                        rebuild_cache=rebuild_cache,
                                        only_core_tags=only_core_tags,
                                        extract_acronyms=extract_acronyms)
        print output

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines = extractor.text_lines_from_local_file(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines = extractor.text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            text_lines = extractor.text_lines_from_url(entry,
                user_agent=bconfig.CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
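A minimal invocation sketch for the version above. The directory, URL, and taxonomy name are illustrative only, and `bconfig` is assumed to be BibClassify's imported config module, as in the function body:

# Hypothetical call: harvest keywords from every file in a directory and
# from one remote document, against the HEP taxonomy (illustrative paths).
output_keywords_for_sources(["/tmp/fulltexts", "http://example.org/paper.pdf"],
                            "HEP",
                            output_mode="text",
                            output_limit=20,
                            with_author_keywords=True)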
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
                                output_mode="text",
                                output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                spires=False,
                                verbose=None,
                                only_core_tags=False,
                                extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize the cache of compiled keyword regular expressions.
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy,
                                           rebuild=rebuild_cache,
                                           no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry,
                      stream=sys.stderr, verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                # Join the path properly instead of concatenating strings,
                # which broke when the directory lacked a trailing separator.
                filepath = os.path.join(entry, filename)
                if os.path.isfile(filepath):
                    text_lines = text_lines_from_local_file(filepath)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                                              output_mode=output_mode,
                                              output_limit=output_limit,
                                              spires=spires,
                                              match_mode=match_mode,
                                              with_author_keywords=with_author_keywords,
                                              only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                                                    for expansion in expansions])
                        acronyms_str.append("%s %s" % (acronym, expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."
                print keywords + acronyms_str + "\n"
            else:
                print keywords
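The acronym block above implies that `get_acronyms` returns a mapping from each acronym to a list of `(expansion, count)` tuples; that shape is inferred from the `"%s (%d)" % expansion` format string, not documented here. A self-contained sketch of the same formatting with made-up data:

# Made-up data in the shape the loop expects: acronym -> [(expansion, count)].
acronyms = {"LHC": [("Large Hadron Collider", 5)],
            "QCD": [("quantum chromodynamics", 3), ("QCD theory", 1)]}
acronyms_str = ["\nAcronyms:"]
for acronym, expansions in acronyms.iteritems():
    # Each (expansion, count) tuple feeds the "%s (%d)" format directly.
    expansions_str = ", ".join(["%s (%d)" % expansion
                                for expansion in expansions])
    acronyms_str.append("%s %s" % (acronym, expansions_str))
print "\n".join(acronyms_str)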
def output_keywords_for_sources(input_sources, taxonomy_name,
                                output_mode="text",
                                output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
                                spires=False,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                rebuild_cache=False,
                                only_core_tags=False,
                                extract_acronyms=False,
                                **kwargs):
    """Outputs the keywords for each source in sources."""
    # Inner function that does the actual work. Refactoring the call
    # site would be too much work, and it must be defined outside the
    # loop: previously, multiple files were not processed.
    def process_lines():
        if output_mode == "text":
            print "Input file: %s" % source

        output = get_keywords_from_text(text_lines,
                                        taxonomy_name,
                                        output_mode=output_mode,
                                        output_limit=output_limit,
                                        spires=spires,
                                        match_mode=match_mode,
                                        no_cache=no_cache,
                                        with_author_keywords=with_author_keywords,
                                        rebuild_cache=rebuild_cache,
                                        only_core_tags=only_core_tags,
                                        extract_acronyms=extract_acronyms)
        print output

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines = extractor.text_lines_from_local_file(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines = extractor.text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            text_lines = extractor.text_lines_from_url(
                entry, user_agent=make_user_agent_string("BibClassify"))
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
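The only functional change from the first version is the user agent for URL fetches: a string built by `make_user_agent_string("BibClassify")` instead of the static `bconfig.CFG_BIBCLASSIFY_USER_AGENT` constant. A guarded-import sketch, assuming the helper lives in `invenio.textutils` as in Invenio 1.x; the fallback definition is purely hypothetical:

try:
    from invenio.textutils import make_user_agent_string
except ImportError:
    # Hypothetical fallback mirroring the older behaviour: a static,
    # component-tagged user agent string (not Invenio's actual default).
    def make_user_agent_string(component):
        return "Invenio %s" % component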