def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingests NLS pages, applies all four preprocessing treatments (none, normalize, lemmatize, stem) to each page, and saves the results to a PostgreSQL table, together with metadata for each page.
    Columns written: title, edition, year, place, archive_filename, source_text_filename, text_unit, text_unit_id, num_text_unit,
    type_archive, model, source_text_raw, source_text_clean, source_text_norm, source_text_lemmatize, source_text_stem, num_words

    Data is saved as a DataFrame into a PostgreSQL table.

    Example:
    ('Encyclopaedia Britannica,"Seventh edition, Volume 13, LAB-Magnetism",1842,Edinburgh,/mnt/lustre/at003/at003/rfilguei2/nls-data-encyclopaediaBritannica/193108323,alto/193201394.34.xml,page,Page9,810,book,nls,"THE ENCYCLOPAEDIA BRITANNICA DICTIONARY OF ARTS, SCIENCES, AND GENERAL LITERATURE. SEVENTH EDITION, i WITH PRELIMINARY DISSERTATIONS ON THE HISTORY OF THE SCIENCES, AND OTHER EXTENSIVE IMPROVEMENTS AND ADDITIONS; INCLUDING THE LATE SUPPLEMENT. A GENERAL INDEX, AND NUMEROUS ENGRAVINGS. VOLUME XIII. ADAM AND CHARLES BLACK, EDINBURGH; M.DCCC.XLII.","THE ENCYCLOPAEDIA BRITANNICA DICTIONARY OF ARTS, SCIENCES, AND GENERAL LITERATURE. SEVENTH EDITION, i WITH PRELIMINARY DISSERTATIONS ON THE HISTORY OF THE SCIENCES, AND OTHER EXTENSIVE IMPROVEMENTS AND ADDITIONS; INCLUDING THE LATE SUPPLEMENT. A GENERAL INDEX, AND NUMEROUS ENGRAVINGS. VOLUME XIII. ADAM AND CHARLES BLACK, EDINBURGH; M.DCCC.XLII.",the encyclopaedia britannica dictionary of arts sciences and general literature seventh edition i with preliminary dissertations on the history of the sciences and other extensive improvements and additions including the late supplement a general index and numerous engravings volume xiii adam and charles black edinburgh mdcccxlii,the encyclopaedia britannica dictionary of art science and general literature seventh edition i with preliminary dissertation on the history of the science and other extensive improvement and addition including the late supplement a general index and numerous engraving volume xiii adam and charles black edinburgh mdcccxlii,the encyclopaedia britannica dictionari of art scienc and gener literatur seventh edit i with preliminari dissert on the histori of the scienc and other extens improv and addit includ the late supplement a gener index and numer engrav volum xiii adam and charl black edinburgh mdcccxlii,46')

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :param context: Spark context used to create the SQLContext
    :type context: pyspark.context.SparkContext
    :return: "0"
    :rtype: string
    """
    
    preprocess_none = query_utils.parse_preprocess_word_type("none")
    preprocess_normalize = query_utils.parse_preprocess_word_type("normalize")
    preprocess_lemmatize = query_utils.parse_preprocess_word_type("lemmatize")
    preprocess_stem = query_utils.parse_preprocess_word_type("stem")
    text_unit = "page"
    # [(title, edition, year, place, archive filename, num pages,
    #   type of archive, model, document)]
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, document.year, \
                          document.place, document.archive.filename, document.num_pages, \
                           document.document_type, document.model, document) for document in list(archive)])
    # [(title, edition, year, place, archive filename, page filename, text_unit, text_unit_id,
    #   num_text_unit, type of archive, model, raw_page, clean_page, num_words)]
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                               year_document[3], year_document[4], page.code, text_unit, page.page_id, \
                               year_document[5], year_document[6], year_document[7], get_page_as_string(page, preprocess_none), \
                               clean_page_as_string(page), len(page.words)) for page in year_document[8]])
    # [(title, edition, year, place, archive filename, page filename, text_unit, text_unit_id,
    #   num_text_unit, type of archive, model, raw_page, clean_page, clean_norm_page, clean_lemma_page, clean_stem_page, num_words)]
    pages = pages_clean.flatMap(
        lambda clean_page: [(clean_page[0], clean_page[1], clean_page[2],\
                               clean_page[3], clean_page[4], clean_page[5], clean_page[6], clean_page[7], \
                               clean_page[8], clean_page[9], clean_page[10], clean_page[11],\
                               clean_page[12], preprocess_clean_page(clean_page[12], preprocess_normalize),\
                               preprocess_clean_page(clean_page[12], preprocess_lemmatize), preprocess_clean_page(clean_page[12], preprocess_stem), clean_page[13])])

    nlsRow = Row("title", "edition", "year", "place", "archive_filename", "source_text_filename", "text_unit", "text_unit_id", "num_text_unit", "type_archive", "model", "source_text_raw", "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem", "num_words")

    sqlContext = SQLContext(context)
    df = sqlContext.createDataFrame(pages, nlsRow)
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    url = "jdbc:postgresql://%s:%s/%s" % (config["host"],config["port"],config["database"])
    properties = {"user": config["user"] ,"driver": config["driver"]}
    
    mode = "overwrite"
    df.write.jdbc(url=url, table=config["table"], mode=mode, properties=properties)
    return "0"
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordances for keywords, using a window of words, and groups the results by date.

    Data in ES has the following columns:

    "title",  "edition", "year", "place", "archive_filename", 
    "source_text_filename", "text_unit", "text_unit_id", 
    "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:
          [(year, [(title, edition, archive_filename, filename, word, concordance),
              (title, edition, archive_filename, filename, word, concordance), ...]), ...]


    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    window = 20
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))


    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(lambda archive: [(
        document.year, document, document.title, document.edition, document.
        archive.filename) for document in list(archive)])
    # [(year, page_string)

    #(year, title, edition, archive_filename, page_code, clean_page_string)
    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], year_document[2], year_document[3], year_document[
            4], page.code, clean_page_as_string(
                page, defoe_path, os_type)) for page in year_document[1]])

    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    pages = clean_pages.flatMap(lambda cl_page: [(
        cl_page[0], cl_page[1], cl_page[2], cl_page[3], cl_page[4],
        preprocess_clean_page(cl_page[5], preprocess_type))])
    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive_filename, filename, text, [(word, idx), (word, idx) ...]), ...]
    matching_idx = filter_pages.map(lambda year_page: (
        (year_page[0], year_page[1], year_page[2], year_page[3], year_page[4],
         year_page[5], get_text_keyword_idx(year_page[5], keysentences))))

    # [(year, [(title, edition, archive_filename, filename, word, [concordance, ...]), ...])]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
                {"title":year_idx[1], "edition": year_idx[2], "archive_filename": year_idx[3], "filename":year_idx[4], "term": word_idx[0],\
                 "snippet": get_concordance_string(year_idx[5], word_idx[0], word_idx[1], window)})\
                 for word_idx in year_idx[6]])

    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
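
# A minimal, Spark-free sketch of the windowing idea behind the concordance
# snippets above: keep `window` tokens either side of a match position. This
# is an illustration only, not defoe's get_concordance_string implementation.
def concordance_window(tokens, match_idx, window):
    # Clamp the left edge at 0 so matches near the start of the page work too.
    start = max(0, match_idx - window)
    return tokens[start:match_idx + window + 1]

# e.g. concordance_window("a b c d e f g".split(), 3, 2) -> ['b', 'c', 'd', 'e', 'f']
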
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences and groups the results by word.

    The config_file must indicate the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also specify the preprocessing treatment, along with the defoe
    path and the operating system type.

    Returns result of form:

        {
          <WORD>:
          [
            [<YEAR>, <NUM_WORDS>],
            ...
          ],
          <WORD>:
          ...

        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, page_string)

    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], clean_page_as_string(page, defoe_path, os_type)) for page in
                                                           year_document[1]])
    pages = clean_pages.flatMap(lambda cl_page: [(cl_page[
        0], preprocess_clean_page(cl_page[1], preprocess_type))])
    # [(year, page_string), ...]
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(lambda year_page: (year_page[
        0], get_sentences_list_matches(year_page[1], keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(keysentence, (year, num_keysentences)), ...]
    # =>
    # [(keysentence, [year, num_keysentences]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda yearsentence_count:
             (yearsentence_count[0][1],
              (yearsentence_count[0][0], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
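
# The reduceByKey/groupByKey chain above reshapes ((year, keysentence), 1)
# pairs into keysentence -> [(year, count), ...]. A Spark-free sketch of the
# same reshaping, handy for checking expectations on small inputs; the sample
# data in the comment is illustrative only.
from collections import Counter, defaultdict

def group_counts_by_sentence(year_sentence_pairs):
    counts = Counter(year_sentence_pairs)      # (year, sentence) -> count
    grouped = defaultdict(list)
    for (year, sentence), n in counts.items():
        grouped[sentence].append((year, n))    # sentence -> [(year, count), ...]
    return dict(grouped)

# group_counts_by_sentence([(1795, "kail"), (1795, "aff"), (1795, "aff")])
# -> {'kail': [(1795, 1)], 'aff': [(1795, 2)]}
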
def do_query(df, config_file=None, logger=None, context=None):
    """
    Gets concordances for keywords and groups the results by date.

    Data in HDFS has the following columns:
    
    "title",  "edition", "year", "place", "archive_filename",  "source_text_filename", "text_unit", 
    "text_unit_id", "num_text_unit", "type_archive", "model", "type_page", "header", "term", "definition",
    "num_articles", "num_page_words", "num_article_words", 

    config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also specify the preprocessing treatment, along with the defoe
    path and the operating system type.

    Returns result of form:

        {
          <YEAR>:
          [
            [- title:
             - edition:
             - archive_filename:
             - page number:
             - header:
             - term:
             - article:
             - article-definition: ],
            [],
            ...
          ],
          <YEAR>:
          ...
        }
  
    :param df: DataFrame of NLS articles data
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Treat blank definitions as nulls, keep only non-null rows whose model is "nlsArticles",
    # and select the columns needed for the query.
    fdf = df.withColumn("definition", blank_as_null("definition"))
    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text)
    newdf = fdf.filter(fdf.definition.isNotNull()).filter(
        fdf["model"] == "nlsArticles").select(fdf.year, fdf.title, fdf.edition,
                                              fdf.archive_filename,
                                              fdf.source_text_filename,
                                              fdf.text_unit_id, fdf.type_page,
                                              fdf.header, fdf.term,
                                              fdf.definition)
    articles = newdf.rdd.map(tuple)

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, preprocessed_article, original_article)

    preprocess_articles = articles.flatMap(lambda t_articles: [(
        t_articles[0], t_articles[1], t_articles[2], t_articles[3], t_articles[
            4], t_articles[5], t_articles[6], t_articles[7], t_articles[8],
        preprocess_clean_page(t_articles[9], preprocess_type), t_articles[9])])

    filter_articles = preprocess_articles.filter(lambda year_page: any(
        keysentence in year_page[9] for keysentence in keysentences))

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text, list_sentences)
    matching_articles = filter_articles.map(lambda year_article: (
        year_article[0], year_article[1], year_article[2], year_article[
            3], year_article[4], year_article[5], year_article[
                6], year_article[7], year_article[8], year_article[10],
        get_articles_list_matches(year_article[9], keysentences)))

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text, sentence)
    matching_sentences = matching_articles.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], year_sentence[3],
                                year_sentence[4], year_sentence[5], year_sentence[6], year_sentence[7],
                                year_sentence[8], year_sentence[9], sentence)\
                                for sentence in year_sentence[10]])

    matching_data = matching_sentences.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[1],
            "edition": sentence_data[2],
            "archive_filename": sentence_data[3],
            "filename": sentence_data[4],
            "page number": sentence_data[5],
            "type_page": sentence_data[6],
            "header": sentence_data[7],
            "term": sentence_data[10],
            "article": sentence_data[8],
            "article-definition": sentence_data[9]
        }))

    # [(date, {"title": title, ...}), ...]
    # =>

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
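
# blank_as_null above comes from defoe's SQL utilities. One common way to
# express such a helper with pyspark.sql.functions (a sketch, not necessarily
# defoe's implementation) is to map empty strings to null so that the
# isNotNull() filter also drops blank definitions:
from pyspark.sql.functions import col, when

def blank_as_null_sketch(column_name):
    return when(col(column_name) != "", col(column_name)).otherwise(None)
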
def do_query(archives, config_file=None, logger=None, context=None):
    """
    The query counts as a “hit” every page that contains a particular
    term from the lexicon, and it groups the results by year.

    The config_file must indicate the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also specify the preprocessing treatment, along with the defoe
    path and the operating system type.

    If a term appears several times in the same page, it is still counted as “1”.
            Example:
            1795:
            - - kail
              - 1
            - - aff
              - 4
            - - lairds
              - 1
    That means that kail appears in 1 page, aff in 4 pages and lairds in 1 page across all the books in the year 1795.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type= "sys-i386-snow-leopard"
    else:
            os_type = "sys-i386-64"
    if "defoe_path" in config :
        defoe_path= config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config, os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(
                word, preprocess_type) for word in k_split]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document) for document in list(archive)])
    # [(year, page_string)
    
    
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0],  
                                    clean_page_as_string(page, defoe_path, os_type)) 
                                       for page in year_document[1]])
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], 
                                    preprocess_clean_page(cl_page[1], preprocess_type))]) 
    # [(year, page_string), ...]
    filter_pages = pages.filter(
        lambda year_page: any(
            keysentence in year_page[1] for keysentence in keysentences))
    
    
    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                              get_sentences_list_matches(
                                  year_page[1],
                                  keysentences)))
    

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])


    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
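
# The os_type / defoe_path defaults above are repeated across several of
# these queries. A small helper capturing the same logic; the function name
# is illustrative and not part of defoe.
def extract_runtime_config(config):
    # Default to the 64-bit Linux setting unless the config says otherwise.
    if config.get("os_type", "linux") == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    defoe_path = config.get("defoe_path", "./")
    return os_type, defoe_path
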
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts number of occurrences of keywords or keysentences and groups by year.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords/keysentences and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, page_string)

    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], clean_page_as_string(page)) for page in year_document[1]])
    pages = clean_pages.flatMap(lambda cl_page: [(cl_page[
        0], preprocess_clean_page(cl_page[1], preprocess_type))])
    # [(year, page_string), ...]
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(lambda year_page: (year_page[
        0], get_sentences_list_matches(year_page[1], keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
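
# Every query above normalizes its lexicon the same way: split each line,
# preprocess each token, and re-join the tokens with single spaces. An
# equivalent, more compact sketch; preprocess_word below is a stand-in for
# query_utils.preprocess_word and simply lower-cases as a placeholder.
def load_keysentences(data_file, preprocess_word=lambda w: w.lower()):
    keysentences = []
    with open(data_file, "r") as f:
        for line in f:
            tokens = [preprocess_word(word) for word in line.split()]
            keysentences.append(" ".join(tokens))
    return keysentences
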
def do_query(archives, config_file=None, logger=None, context=None):
    """
    The query counts as a “hit” every time it finds a term from our lexicon,
    and it groups the results by book title.

    config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also specify the preprocessing treatment, along with the defoe
    path and the operating system type.

         -  '''Twas on the morn of sweet May Day':
                - - neu
                     - 1
                - - blaw
                     - 5
     That means that neu appears once in the book 'Twas on the morn of sweet May Day',
     and blaw appears 5 times in the same book.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by title
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(title, document), ...]
    documents = archives.flatMap(lambda archive: [(
        document.title, document) for document in list(archive)])

    # [(title, page_string), ...]

    clean_pages = documents.flatMap(lambda title_document: [(title_document[
        0], clean_page_as_string(page, defoe_path, os_type)) for page in
                                                            title_document[1]])
    pages = clean_pages.flatMap(lambda cl_page: [(cl_page[
        0], preprocess_clean_page(cl_page[1], preprocess_type))])
    # [(title, page_string), ...]
    filter_pages = pages.filter(lambda title_page: any(
        keysentence in title_page[1] for keysentence in keysentences))

    # [(title, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(lambda title_page: (title_page[
        0], get_sentences_list_matches_per_page(title_page[1], keysentences)))

    # [((title, keysentence), 1), ((title, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(lambda title_sentence: [(
        (title_sentence[0], sentence), 1) for sentence in title_sentence[1]])

    # [((title, keysentence), num_keysentences), ...]
    # =>
    # [(title, (keysentence, num_keysentences)), ...]
    # =>
    # [(title, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda titlesentence_count:
             (titlesentence_count[0][0],
              (titlesentence_count[0][1], titlesentence_count[1]))) \
        .groupByKey() \
        .map(lambda title_sentencecount:
             (title_sentencecount[0], list(title_sentencecount[1]))) \
        .collect()
    return result
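
# The docstrings above distinguish two ways of counting: the per-year query
# counts a keyword at most once per page, while this per-title query counts
# every occurrence. A plain-Python sketch of that difference (an illustration
# of the behaviour described in the docstrings, not defoe's helpers):
def match_counts(page_tokens, keywords, once_per_page=True):
    hits = {}
    for keyword in keywords:
        n = page_tokens.count(keyword)
        if n:
            hits[keyword] = 1 if once_per_page else n
    return hits

# match_counts(["blaw", "blaw", "neu"], ["blaw", "neu"]) -> {'blaw': 1, 'neu': 1}
# match_counts(["blaw", "blaw", "neu"], ["blaw", "neu"], once_per_page=False) -> {'blaw': 2, 'neu': 1}
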
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordances for keywords, using a window of words (configured here to 40), and groups the results by date.
    Stores the snippet (40 words before and after each term).

    config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also specify the preprocessing treatment, along with the defoe
    path and the operating system type.
    
    
    Returns result of form:
        {
          <YEAR>:
          [
            [- archive_filename: 
             - edition:
             - filename:
             - snippet:
             - term
             - title ]
            ...
          ],
          <YEAR>:
          ...
        }
  


    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    window = 40
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(lambda archive: [(
        document.year, document, document.title, document.edition, document.
        archive.filename) for document in list(archive)])
    # [(year, page_string)

    #(year, title, edition, archive_filename, page_code, clean_page_string)
    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], year_document[2], year_document[3], year_document[
            4], page.code, clean_page_as_string(
                page, defoe_path, os_type)) for page in year_document[1]])

    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    pages = clean_pages.flatMap(lambda cl_page: [(
        cl_page[0], cl_page[1], cl_page[2], cl_page[3], cl_page[4],
        preprocess_clean_page(cl_page[5], preprocess_type))])
    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive_filename, filename, text, [(word, idx), (word, idx) ...]), ...]
    matching_idx = filter_pages.map(lambda year_page: (
        (year_page[0], year_page[1], year_page[2], year_page[3], year_page[4],
         year_page[5], get_text_keysentence_idx(year_page[5], keysentences))))

    # [(year, [(title, edition, archive_filename, filename, word, [concordance, ...]), ...])]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
                {"title":year_idx[1], "edition": year_idx[2], "archive_filename": year_idx[3], "filename":year_idx[4], "term": word_idx[0],\
                 "snippet": get_concordance_string(year_idx[5], word_idx[0], word_idx[1], window)})\
                 for word_idx in year_idx[6]])

    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets the concordance (also called details) of occurrences of keywords or keysentences, grouped by year.

    The config_file must indicate the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also specify the preprocessing treatment, along with the defoe
    path and the operating system type.

    Returns result of form:

        {
          <YEAR>:
          [
            - [title:
             place:
             publisher:
             snippet:
             term:
             document_id:
             filename]
            - []
            ...
          ],
          <YEAR>:
          ...
        }
        
       

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, page_string)

    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[
            1], page, clean_page_as_string(page, defoe_path, os_type))
                               for page in year_document[1]])
    pages = clean_pages.flatMap(lambda cl_page: [(cl_page[0], cl_page[
        1], cl_page[2], preprocess_clean_page(cl_page[3], preprocess_type))])
    # [(year, page_string), ...]
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[3] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0], year_page[1], year_page[
            2], get_sentences_list_matches(year_page[3], keysentences)))

    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], sentence)\
                                for sentence in year_sentence[3]])

    matching_data = matching_sentences.map(
        lambda page_data: (page_data[0], {
            "title": page_data[1].title,
            "place": page_data[1].place,
            "publisher": page_data[1].publisher,
            "page_number": page_data[2].code,
            "snippet": page_data[2].content,
            "term": page_data[3],
            "document_id": page_data[1].code,
            "filename": page_data[1].archive.filename
        }))


    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
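
# Note that groupByKey(...).collect() in the queries above returns a list of
# (key, [record, ...]) pairs rather than the mapping shown in the docstrings;
# a caller that wants the dict form can simply wrap the collected result.
def result_as_dict(grouped_result):
    # [(year, [record, ...]), ...] -> {year: [record, ...], ...}
    return dict(grouped_result)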