Code example #1
def do_query(issues, config_file=None, logger=None, context=None):
    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    clean_articles = issues.flatMap(lambda issue: [(
        issue.date.year, clean_article_as_string(
            article, defoe_path, os_type)) for article in issue.articles])

    preprocessed_articles = clean_articles.map(lambda cl_article: (cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type)))

    # [(year, article_string), ...]
    filter_articles = preprocessed_articles.filter(
        lambda year_article: any(k in year_article[1] for k in keywords))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keywords)))

    # [[((year, keysentence), 1), ((year, keysentence), 1), ...], ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda a: (a[0], [{x[0]: x[1]} for x in a[1]])) \
        .collect()
    return result
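
Code example #1 takes its keywords directly from the YAML configuration file rather than from a separate lexicon file. The following is a minimal, hypothetical sketch of such a configuration, built and dumped from Python; the "preprocess" key name is an assumption about what query_utils.extract_preprocess_word_type looks for, while "keywords" and "defoe_path" are the keys the query above reads explicitly.

# Hypothetical configuration for code example #1 (a sketch, not the canonical
# defoe config format). "preprocess" is an assumed key name; "keywords" and
# "defoe_path" are read directly by the query above.
import yaml

example_config = {
    "preprocess": "normalize",            # assumed key for extract_preprocess_word_type
    "defoe_path": "/home/user/defoe/",    # optional; the query falls back to "./"
    "keywords": ["liberty", "freedom of the press"],
}

with open("keywords_config.yml", "w") as f:
    yaml.dump(example_config, f, default_flow_style=False)
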
Code example #2
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences, filtered by date,
    and groups the results by year.

    config_file must be the path to a YAML configuration file that points to a
    lexicon file with the keywords or keysentences to search for, one per line.

    The config_file can also indicate the preprocessing treatment, the defoe
    path, and the operating system type, as well as how many target words to
    use and at which position the lexicon words start.

    The number of target words to take from the lexicon file is given as, for
    example, num_target: 1, meaning that only the first word/sentence is used
    as the target.

    To include the target words in the lexicon, set lexicon_start: 0 in the
    configuration file.

    To exclude the target words from the lexicon (say there is just one target
    word), set lexicon_start: 1.

    Finally, the date range for this query is given in the configuration file
    as follows:

      start_year: YEAR_START (inclusive)
      end_year: YEAR_FINISH (inclusive)

    (A sketch of such a configuration file follows this example.)

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    start_year = int(config["start_year"])
    end_year = int(config["end_year"])
    num_target = int(config["num_target"])
    lexicon_start = int(config["lexicon_start"])

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string), ...]

    target_sentences = keysentences[0:num_target]
    keysentences = keysentences[lexicon_start:]
    clean_articles = issues.flatMap(lambda issue: [(
        issue.date.year, clean_article_as_string(article, defoe_path, os_type)
    ) for article in issue.articles if int(
        issue.date.year) >= start_year and int(issue.date.year) <= end_year])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(lambda cl_article: [(cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type))])

    # [(year, clean_article_string), ...]
    target_articles = t_articles.filter(lambda year_article: any(
        target_s in year_article[1] for target_s in target_sentences))

    # [(year, clean_article_string), ...]
    filter_articles = target_articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_articles_list_matches(year_article[1], keysentences)))

    # [[((year, keysentence), 1), ((year, keysentence), 1), ...], ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
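
The configuration that code example #2 expects might look like the sketch below. The "preprocess" and "data" key names are assumptions about what extract_preprocess_word_type and extract_data_file look for; os_type, defoe_path, start_year, end_year, num_target and lexicon_start are read explicitly by the query, and the lexicon file lists one word or sentence per line, with the target(s) first.

# Hypothetical configuration and lexicon layout for code example #2 (a sketch).
# "preprocess" and "data" are assumed key names; the remaining keys are read
# explicitly by the query above. All values are invented for illustration.
import yaml

config = {
    "preprocess": "normalize",        # assumed key for extract_preprocess_word_type
    "data": "lexicon.txt",            # assumed key for extract_data_file
    "os_type": "linux",
    "defoe_path": "/home/user/defoe/",
    "start_year": 1841,               # inclusive
    "end_year": 1870,                 # inclusive
    "num_target": 1,                  # the first lexicon line is the target
    "lexicon_start": 1,               # keysentences start at the second line
}
with open("target_config.yml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

# lexicon.txt: the target word/sentence first, then the keysentences.
with open("lexicon.txt", "w") as f:
    f.write("machine\n")              # target (num_target: 1)
    f.write("steam engine\n")         # keysentence
    f.write("railway\n")              # keysentence
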
Code example #3
def do_query(issues, config_file=None, logger=None, context=None):

    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    unproc_targetwords = config['targetwords']
    targetwords = []
    for t in unproc_targetwords:
        targetwords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in t.split()
        ]))
    print(f'targetwords: {targetwords}')

    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    preprocessed_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[
            2], preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, clean_article_string), ...]
    filter_articles = preprocessed_articles.filter(
        lambda year_article: any(t in year_article[3] for t in targetwords))

    # [(year, [keysentence, keysentence]), ...]
    matching_articles = filter_articles.flatMap(
        lambda year_article: [(year_article[0], year_article[1], year_article[
            2], k) for k in find_matches(year_article[3], keywords)])

    matching_data = matching_articles.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id": sentence_data[2].article_id,
            "page_ids": list(sentence_data[2].page_ids),
            "section": sentence_data[2].ct,
            "keyword": sentence_data[3],
            "targets": list(targetwords),
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
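
After collect(), code example #3 returns a list of (year, records) pairs in which each record is the metadata dictionary built in matching_data above. A sketch of one entry, with invented values, could look like this:

# Illustrative shape of one collected entry from code example #3; every value
# here is invented, only the keys mirror the query above.
example_entry = (
    1850,
    [
        {
            "title": "Some article title",
            "article_id": "art0001",
            "page_ids": ["page1", "page2"],
            "section": "news",
            "keyword": "steam engine",
            "targets": ["machine"],
            "issue_id": "NID123",
            "filename": "/data/papers/1850/issue_01.xml",
        },
        # ... one dictionary per (article, keyword) match in that year
    ],
)
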
Code example #4
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences and groups the
    results by year.

    config_file must be the path to a YAML configuration file that points to a
    data file with the keywords to search for, one per line.

    Both keywords/keysentences and words in articles are normalized by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, clean_article_as_string(article))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(lambda cl_article: [(cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type))])
    # [(year, clean_article_string), ...]
    filter_articles = t_articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keysentences)))

    # [[((year, keysentence), 1), ((year, keysentence), 1), ...], ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
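
The reduceByKey/groupByKey chain that closes code example #4 is essentially a distributed word count over (year, keysentence) pairs, as the comments above it describe. A plain-Python sketch of the same transformation, without Spark:

# Plain-Python sketch of the final aggregation in code example #4:
# [((year, keysentence), 1), ...]  ->  counts per (year, keysentence)
#                                  ->  {year: [(keysentence, count), ...]}
from collections import Counter, defaultdict

pairs = [((1850, "steam engine"), 1), ((1850, "railway"), 1),
         ((1850, "steam engine"), 1), ((1851, "railway"), 1)]

counts = Counter()
for key, n in pairs:                          # reduceByKey(add)
    counts[key] += n

by_year = defaultdict(list)
for (year, sentence), n in counts.items():    # map + groupByKey
    by_year[year].append((sentence, n))

print(dict(by_year))
# {1850: [('steam engine', 2), ('railway', 1)], 1851: [('railway', 1)]}
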
Code example #5
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects the text of articles, along with metadata, using a list of keywords
    or keysentences, and groups the results by year. The result stores the
    preprocessed text instead of the original text.

    config_file must be the path to a YAML configuration file that points to a
    lexicon file with the keywords or keysentences to search for, one per line.

    The config_file can also indicate the preprocessing treatment, the defoe
    path, and the operating system type, as well as how many target words to
    use and at which position the lexicon words start.

    The number of target words to take from the lexicon file is given as, for
    example, num_target: 1, meaning that only the first word/sentence is used
    as the target.

    To include the target words in the lexicon, set lexicon_start: 0 in the
    configuration file.

    To exclude the target words from the lexicon (say there is just one target
    word), set lexicon_start: 1.

    Returns result of form:

        {
          <YEAR>:
          [
            [- article_id: 
             - authors:
             - filename:
             - issue_id:
             - page_ids:
             - preprocessed_text:
             - term
             - title ]
            ...
          ],
          <YEAR>:
          ...
        }
        


    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: matching articles and their metadata, grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    num_target = int(config["num_target"])
    lexicon_start = int(config["lexicon_start"])
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string), ...]

    target_sentences = keysentences[0:num_target]
    keysentences = keysentences[lexicon_start:]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[
            2], preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, clean_article_string), ...]
    target_articles = t_articles.filter(lambda year_article: any(
        target_s in year_article[3] for target_s in target_sentences))

    # [(year, clean_article_string), ...]
    filter_articles = target_articles.filter(lambda year_article: any(
        keysentence in year_article[3] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count

    matching_articles = filter_articles.map(lambda year_article: (
        year_article[0], year_article[1], year_article[2], year_article[3],
        get_articles_list_matches(year_article[3], keysentences)))

    matching_sentences = matching_articles.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], year_sentence[3], sentence)\
                                for sentence in year_sentence[4]])

    matching_data = matching_sentences.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id:": sentence_data[2].article_id,
            "authors:": sentence_data[2].authors_string,
            "page_ids": list(sentence_data[2].page_ids),
            "term": sentence_data[4],
            "preprocessed text": sentence_data[3],
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    # [(date, {"title": title, ...}), ...]
    # =>

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
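
As the docstring notes, code example #5 stores the preprocessed article text in each record (code example #7 below keeps the original text instead). One collected entry might look like the following sketch, with invented values:

# Illustrative shape of one collected entry from code example #5; every value
# is invented, only the keys mirror the matching_data dictionary above.
example_entry = (
    1850,
    [
        {
            "title": "Some article title",
            "article_id": "art0001",
            "authors": "J. Smith",
            "page_ids": ["page1"],
            "term": "steam engine",
            "preprocessed text": "the new steam engine was shown at ...",
            "issue_id": "NID123",
            "filename": "/data/papers/1850/issue_01.xml",
        },
    ],
)
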
Code example #6
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences and groups the
    results by year.

    This is the recommended query to use when there are no target words.

    config_file must be the path to a YAML configuration file that points to a
    lexicon file with the keywords to search for, one per line.

    The config_file can also indicate the preprocessing treatment, the defoe
    path, and the operating system type.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string), ...]
    clean_articles = issues.flatMap(lambda issue: [(
        issue.date.year, clean_article_as_string(
            article, defoe_path, os_type)) for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(lambda cl_article: [(cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type))])
    # [(year, clean_article_string), ...]
    filter_articles = t_articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keysentences)))

    # [[((year, keysentence), 1), ((year, keysentence), 1), ...], ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
Code example #7
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects the text of articles, along with metadata, using a list of keywords
    or keysentences, and groups the results by year.

    config_file must be the path to a YAML configuration file with the list of
    keywords or keysentences to search for. It can also indicate the
    preprocessing treatment and the defoe path.

    Returns result of form:

        {
          <YEAR>:
          [
            [- article_id: 
             - authors:
             - filename:
             - issue_id:
             - page_ids:
             - text:
             - term
             - title ]
            ...
          ],
          <YEAR>:
          ...
        }


    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: matching articles and their metadata, grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    os_type = "sys-i386-64"
    if sys.platform == 'linux':
        os_type = "sys-i386-64"
    elif sys.platform == 'darwin':
        os_type = "sys-i386-snow-leopard"

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[
            2], preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, clean_article_string), ...]
    filter_articles = t_articles.filter(
        lambda year_article: any(k in year_article[3] for k in keywords))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count

    matching_articles = filter_articles.map(
        lambda year_article: (year_article[0], year_article[1], year_article[
            2], get_articles_list_matches(year_article[3], keywords)))

    #   matching_sentences = matching_articles.flatMap(
    #       lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], sentence)\
    #                               for sentence in year_sentence[3]])

    matching_data = matching_articles.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id:": sentence_data[2].article_id,
            "authors:": sentence_data[2].authors_string,
            "page_ids": list(sentence_data[2].page_ids),
            "section": sentence_data[2].ct,
            "term": sentence_data[3],
            "original text": sentence_data[2].words_string,
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
Code example #8
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects the text of articles, along with metadata, using a list of keywords
    or keysentences, and groups the results by year.

    config_file must be the path to a YAML configuration file with the lists of
    keywords and control words to search for, and the maximum distance allowed
    between them. It can also indicate the preprocessing treatment and the
    defoe path.

    (A sketch of such a configuration file follows this example.)


    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: keyword and control-word matches with article metadata, grouped by year
    :rtype: dict
    """

    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    distance = int(config['distance'])

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    unproc_controlwords = config['controlwords']
    controlwords = []
    for c in unproc_controlwords:
        controlwords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in c.split()
        ]))
    print(f'controlwords: {controlwords}')

    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[
            2], preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, clean_article_string)]
    filter_articles = t_articles.filter(
        lambda year_article: any(k in year_article[3] for k in keywords))

    matching_articles = filter_articles.flatMap(
        lambda year_article: [(year_article[0], year_article[
            1], year_article[2], w) for w in within_distance(
                keywords, controlwords, year_article[3], distance)])

    matching_data = matching_articles.map(
        lambda sentence_data: (sentence_data[0], {
            "article_id": sentence_data[2].article_id,
            "page_ids": list(sentence_data[2].page_ids),
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename,
            "original text": sentence_data[2].words_string,
            "keywords": list(sentence_data[3][1]),
            "controls": list(sentence_data[3][0]),
        }))

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
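
Code example #8 additionally reads a distance (presumably a number of words) and a list of control words; within_distance then reports keyword/control co-occurrences within that distance. Below is a hypothetical sketch of the configuration it reads; the "preprocess" key name is an assumption, while distance, keywords, controlwords and defoe_path are read explicitly by the query above.

# Hypothetical configuration for code example #8 (a sketch). "preprocess" is an
# assumed key name; "distance", "keywords" and "controlwords" are read directly
# by the query above. All values are invented for illustration.
import yaml

config = {
    "preprocess": "normalize",            # assumed key for extract_preprocess_word_type
    "defoe_path": "/home/user/defoe/",    # optional; defaults to "./"
    "distance": 10,                       # maximum separation passed to within_distance
    "keywords": ["steam engine", "railway"],
    "controlwords": ["accident", "explosion"],
}
with open("distance_config.yml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)
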