# Imports assumed by these examples (module paths follow the defoe repository
# layout; adjust to match your checkout).
import os
import yaml
from operator import add

from pyspark.sql import Row, SQLContext

from defoe import query_utils
from defoe.nls.query_utils import (clean_page_as_string, get_page_as_string,
                                   preprocess_clean_page,
                                   get_sentences_list_matches,
                                   get_sentences_list_matches_per_page,
                                   get_text_keyword_idx,
                                   get_text_keysentence_idx,
                                   get_concordance_string, georesolve_page_2)


def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingests NLS pages, applies all four preprocessing treatments (none,
    normalize, lemmatize, stem) to each page, and saves the result to a
    PostgreSQL table, together with metadata associated with each page.
    Metadata collected: title, edition, year, place, archive filename, page
    filename, text unit, page id, num pages, type of archive, model,
    source_text_raw, source_text_clean, source_text_norm,
    source_text_lemmatize, source_text_stem, num_words

    Data is saved as a DataFrame into a PostgreSQL table.

    Example:
    ('Encyclopaedia Britannica,"Seventh edition, Volume 13, LAB-Magnetism",1842,Edinburgh,/mnt/lustre/at003/at003/rfilguei2/nls-data-encyclopaediaBritannica/193108323,alto/193201394.34.xml,page,Page9,810,book,nls,"THE ENCYCLOPAEDIA BRITANNICA DICTIONARY OF ARTS, SCIENCES, AND GENERAL LITERATURE. SEVENTH EDITION, i WITH PRELIMINARY DISSERTATIONS ON THE HISTORY OF THE SCIENCES, AND OTHER EXTENSIVE IMPROVEMENTS AND ADDITIONS; INCLUDING THE LATE SUPPLEMENT. A GENERAL INDEX, AND NUMEROUS ENGRAVINGS. VOLUME XIII. ADAM AND CHARLES BLACK, EDINBURGH; M.DCCC.XLII.","THE ENCYCLOPAEDIA BRITANNICA DICTIONARY OF ARTS, SCIENCES, AND GENERAL LITERATURE. SEVENTH EDITION, i WITH PRELIMINARY DISSERTATIONS ON THE HISTORY OF THE SCIENCES, AND OTHER EXTENSIVE IMPROVEMENTS AND ADDITIONS; INCLUDING THE LATE SUPPLEMENT. A GENERAL INDEX, AND NUMEROUS ENGRAVINGS. VOLUME XIII. ADAM AND CHARLES BLACK, EDINBURGH; M.DCCC.XLII.",the encyclopaedia britannica dictionary of arts sciences and general literature seventh edition i with preliminary dissertations on the history of the sciences and other extensive improvements and additions including the late supplement a general index and numerous engravings volume xiii adam and charles black edinburgh mdcccxlii,the encyclopaedia britannica dictionary of art science and general literature seventh edition i with preliminary dissertation on the history of the science and other extensive improvement and addition including the late supplement a general index and numerous engraving volume xiii adam and charles black edinburgh mdcccxlii,the encyclopaedia britannica dictionari of art scienc and gener literatur seventh edit i with preliminari dissert on the histori of the scienc and other extens improv and addit includ the late supplement a gener index and numer engrav volum xiii adam and charl black edinburgh mdcccxlii,46')

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    
    preprocess_none = query_utils.parse_preprocess_word_type("none")
    preprocess_normalize = query_utils.parse_preprocess_word_type("normalize")
    preprocess_lemmatize = query_utils.parse_preprocess_word_type("lemmatize")
    preprocess_stem = query_utils.parse_preprocess_word_type("stem")
    text_unit = "page"
    # [(title, edition, year, place, archive filename, num pages,
    #   type of archive, model, document)]
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, document.year, \
                          document.place, document.archive.filename, document.num_pages, \
                           document.document_type, document.model, document) for document in list(archive)])
    # [(title, edition, year, place, archive filename, page filename,
    #   text_unit, text_unit_id, num_text_unit, type of archive, model,
    #   raw_page, clean_page, num_words)]
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                               year_document[3], year_document[4], page.code, text_unit, page.page_id, \
                               year_document[5], year_document[6], year_document[7], get_page_as_string(page, preprocess_none), \
                               clean_page_as_string(page), len(page.words)) for page in year_document[8]])
    # [(title, edition, year, place, archive filename, page filename,
    #   text_unit, text_unit_id, num_text_unit, type of archive, model,
    #   raw_page, clean_page, clean_norm_page, clean_lemma_page,
    #   clean_stem_page, num_words)]
    pages = pages_clean.flatMap(
        lambda clean_page: [(clean_page[0], clean_page[1], clean_page[2],\
                               clean_page[3], clean_page[4], clean_page[5], clean_page[6], clean_page[7], \
                               clean_page[8], clean_page[9], clean_page[10], clean_page[11],\
                               clean_page[12], preprocess_clean_page(clean_page[12], preprocess_normalize),\
                               preprocess_clean_page(clean_page[12], preprocess_lemmatize), preprocess_clean_page(clean_page[12], preprocess_stem), clean_page[13])])

    nlsRow = Row("title", "edition", "year", "place", "archive_filename",
                 "source_text_filename", "text_unit", "text_unit_id",
                 "num_text_unit", "type_archive", "model", "source_text_raw",
                 "source_text_clean", "source_text_norm",
                 "source_text_lemmatize", "source_text_stem", "num_words")

    sqlContext = SQLContext(context)
    df = sqlContext.createDataFrame(pages, nlsRow)
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    url = "jdbc:postgresql://%s:%s/%s" % (config["host"], config["port"],
                                          config["database"])
    properties = {"user": config["user"], "driver": config["driver"]}
    
    mode = "overwrite"
    df.write.jdbc(url=url, table=config["table"], mode=mode, properties=properties)
    return "0"
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingests NLS pages and applies the Edinburgh geoparser to identify the
    possible locations on each page and to obtain the latitude and longitude
    of each of them.
    Before applying the geoparser, two cleaning steps are applied: replacing
    long-s characters and rejoining hyphenated words.
    
    Example:

    ("Descriptive account of the principal towns in Scotland: to accompany Wood's town atlas", '1828', 1828, 'Edinburgh', '/home/tdm/datasets/nls-data-gazetteersOfScotland/97350713', 'alto/97350911.34.xml', 'page', 'Page17', 376, 'book', 'nls', 'CONTENTS. Page. Aberdeen, 1 Annan, 19 Arbroath, 23 Ayr, .--SO Banff, 39 Berwick, 4S Brechin, 55 Crieff, 61 Cupar Fife, • 65 Dalkeith, 70 Dingwall, 76 DunbartorT, • 79 Dundee, 83 Dumfries, <• 91 Dunfermline, 99 Dunkeid, « 105 Edinburgh, -. . 1 1 1 Elgin, . . . ]29 Forfar, -135 Forres, 139 Glasgow, . 117', {}), ("Descriptive account of the principal towns in Scotland: to accompany Wood's town atlas", '1828', 1828, 'Edinburgh', '/home/tdm/datasets/nls-data-gazetteersOfScotland/97350713', 'alto/97350923.34.xml', 'page', 'Page18', 376, 'book', 'nls', 'Xll Greenock, 171 Haddington, 181 Hamilton, 185 Hawick, 191 Inverary, 199 Inverness, . * •> 203 Irvine, * 211 Jedburgh, * * 215 Kelso, 221 Kilmarnock, • 227 Kirkcaldy 233 Kinross, * * 241 Lanark, * 247 Leith, 253 Linlithgow, «• * 265 Montrose, 271 Nairn, 277 Paisley, 281 Peebles, 291 Perth, * 297 Portobello, 309 Rothesay, * 313 Selkirk, > , 319 St Andrews, 323 Stirling, -^331 Stonehaven, * 339 Stornowav, ... Si-5', {('Hamilton', '1'): ('55.77731433348086', '-4.067392672500774'), ('Inverary', '2'): ('56.2333333', '-5.0666667'), ('Inverness', '3'): ('57.47871409771949', '-4.212450527351024'), ('Lanark', '4'): ('55.67483195471274', '-3.775417694605498')}),



    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, document.year, \
                          document) for document in list(archive)])
    
    # [(title, edition, year, page filename, page id, clean_page)]
    
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                               page.code, page.page_id, clean_page_as_string(page)) for page in year_document[3]])

    
    geo_xml_pages = pages_clean.flatMap(
        lambda clean_page: [(clean_page[0], clean_page[1], clean_page[2],\
                               clean_page[3], clean_page[4], query_utils.geoparser_cmd(clean_page[5]))])
    
    
    matching_pages = geo_xml_pages.map(
        lambda geo_page:
        (geo_page[0],
         {"edition": geo_page[1],
          "year": geo_page[2], 
          "page_filename": geo_page[3],
          "text_unit id": geo_page[4],
          "lang_model": "geoparser_original",
          "georesolution_page": query_utils.geoparser_coord_xml(geo_page[5])}))

    
    result = matching_pages \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
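
# The docstring above mentions two cleaning steps (long-s and hyphenated
# words). This is a simplified, self-contained illustration of what the
# clean_page_as_string helper broadly does; the real defoe implementation
# is more involved.
import re

def clean_text_sketch(text):
    """Replace long-s characters and rejoin words hyphenated across breaks."""
    text = text.replace(u"\u017f", "s")              # long s -> s
    return re.sub(r"(\w)-\s+(\w)", r"\1\2", text)    # "Edin- burgh" -> "Edinburgh"

print(clean_text_sketch(u"The \u017ftreets of Edin- burgh"))  # The streets of Edinburgh
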
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingests NLS pages, applies all four preprocessing treatments (none,
    normalize, lemmatize, stem) to each page, and saves the result to HDFS
    CSV files, together with metadata associated with each page.
    Metadata collected: title, edition, year, place, archive filename, page
    filename, text unit, page id, num pages, type of archive, model,
    source_text_raw, source_text_clean, source_text_norm,
    source_text_lemmatize, source_text_stem, num_words

    Data is saved as a DataFrame into HDFS CSV files.

    Example:
    ('Encyclopaedia Britannica; or, A dictionary of arts, sciences, and miscellaneous literature',
      'Fourth edition ...', 1810, 'Edinburgh',
      '/mnt/lustre/at003/at003/rfilguei2/nls-data-encyclopaediaBritannica/191253839',
      'alto/192209952.34.xml', 'Page5', 446, 'book', 'nls',
      u"Part III. MORAL PHILOSOPHY ...", u"part iii moral ...",
      u"part iii moral ...", u"part iii moral ...", '46')
    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """

    preprocess_none = query_utils.parse_preprocess_word_type("none")
    preprocess_normalize = query_utils.parse_preprocess_word_type("normalize")
    preprocess_lemmatize = query_utils.parse_preprocess_word_type("lemmatize")
    preprocess_stem = query_utils.parse_preprocess_word_type("stem")
    text_unit = "page"
    # [(title, edition, year, place, archive filename, num pages,
    #   type of archive, model, document)]
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, document.year, \
                          document.place, document.archive.filename, document.num_pages, \
                           document.document_type, document.model, document) for document in list(archive)])
    # Column order matches the nlsRow schema: raw, clean, norm, lemmatize, stem.
    pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],
                               year_document[3], year_document[4], page.code, text_unit,
                               page.page_id, year_document[5], year_document[6],
                               year_document[7],
                               get_page_as_string(page, preprocess_none),
                               clean_page_as_string(page),
                               get_page_as_string(page, preprocess_normalize),
                               get_page_as_string(page, preprocess_lemmatize),
                               get_page_as_string(page, preprocess_stem),
                               len(page.words)) for page in year_document[8]])

    nlsRow = Row("title", "edition", "year", "place", "archive_filename",
                 "source_text_filename", "text_unit", "text_unit_id",
                 "num_text_unit", "type_archive", "model", "source_text_raw",
                 "source_text_clean", "source_text_norm",
                 "source_text_lemmatize", "source_text_stem", "num_words")

    sqlContext = SQLContext(context)
    df = sqlContext.createDataFrame(pages, nlsRow)
    df.write.mode('overwrite').option(
        "header", "true").csv("hdfs:///user/at003/rosa/nls_demo.csv")
    return "0"
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingests NLS pages and applies the original Edinburgh geoparser to identify
    the possible locations on each page and to obtain the latitude and
    longitude of each location.

    Before applying the geoparser, two cleaning steps are applied: replacing
    long-s characters and rejoining hyphenated words.

    A config_file should be supplied to specify the gazetteer to use, the
    defoe_path, the bounding box (optional), and the operating system.
    
    Example:
    - 1842:
        - archive: /home/rosa_filgueira_vicente/datasets/sg_simple_sample/97437554
        - edition: 1842, Volume 1
        - georesolution_page:
            - Annan-rb17:
              - in-cc: ''
              - lat: '54.98656134974328'
              - long: '-3.259540348679'
              - pop: ''
              - snippet: is 8 miles north-west of Annan , and commands a fine
              - type: ppl
            - Annan-rb18:
              - in-cc: ''
              - lat: '54.98656134974328'
              - long: '-3.259540348679'
              - pop: ''
              - snippet: valley is washed by the Annan , and lies open from
              - type: ppl
            ....   
        - lang_model: geoparser_original
        - page_filename: alto/97440572.34.xml
        - text_unit id: Page252
        - title: topographical, statistical, and historical gazetteer of Scotland

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    gazetteer = config["gazetteer"]
    if "bounding_box" in config:
        bounding_box = " -lb " + config["bounding_box"] + " 2"
    else:
        bounding_box = ""
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    documents = archives.flatMap(
        lambda archive: [(document.year, document.title, document.edition, \
                          document.archive.filename, document) for document in list(archive)])

    # [(year, title, edition, archive filename, page filename, page id,
    #   clean_page)]

    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                                year_document[3], page.code, page.page_id, clean_page_as_string(page,defoe_path, os_type)) for page in year_document[4]])

    geo_xml_pages = pages_clean.flatMap(
        lambda clean_page: [(clean_page[0], clean_page[1], clean_page[2],\
                               clean_page[3], clean_page[4], clean_page[5], query_utils.geoparser_cmd(clean_page[6], defoe_path, os_type, gazetteer, bounding_box))])

    matching_pages = geo_xml_pages.map(lambda geo_page: (
        geo_page[0], {
            "title": geo_page[1],
            "edition": geo_page[2],
            "archive": geo_page[3],
            "page_filename": geo_page[4],
            "text_unit id": geo_page[5],
            "lang_model": "geoparser_original",
            "georesolution_page": query_utils.geoparser_coord_xml(geo_page[6])
        }))


    result = matching_pages \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
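
# A sketch of the configuration file this geoparser query expects. The key
# names (gazetteer, bounding_box, os_type, defoe_path) come from the code
# above; the values are illustrative only, and bounding_box is optional.
import yaml

sample_geo_config = {
    "gazetteer": "geonames",                 # hypothetical gazetteer name
    "bounding_box": "-7.7 54.5 -0.5 60.9",   # optional; format assumed
    "os_type": "linux",                      # selects sys-i386-64
    "defoe_path": "/home/user/defoe/",       # hypothetical path
}
with open("geoparser_config.yml", "w") as f:
    yaml.safe_dump(sample_geo_config, f)
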
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words, for keywords and groups by date.

    Data in ES has the following columns:

    "title",  "edition", "year", "place", "archive_filename", 
    "source_text_filename", "text_unit", "text_unit_id", 
    "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:
          [(year, [(title, edition, archive_filename, filename, word, concordance),
              (title, edition, archive_filename, filename, word, concordance), ...]), ...]


    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: list
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    window = 20
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document, document.title,
                          document.edition, document.archive.filename)
                         for document in list(archive)])

    # [(year, title, edition, archive_filename, page_code,
    #   clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[2],
                                year_document[3], year_document[4], page.code,
                                clean_page_as_string(page, defoe_path, os_type))
                               for page in year_document[1]])

    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], cl_page[1], cl_page[2], cl_page[3],
                          cl_page[4],
                          preprocess_clean_page(cl_page[5], preprocess_type))])
    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive_filename, filename, text,
    #   [(word, idx), (word, idx), ...]), ...]
    matching_idx = filter_pages.map(lambda year_page: (
        (year_page[0], year_page[1], year_page[2], year_page[3], year_page[4],
         year_page[5], get_text_keyword_idx(year_page[5], keysentences))))

    # [(year, [{title, edition, archive_filename, filename, term,
    #           snippet}, ...]), ...]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
             {"title": year_idx[1], "edition": year_idx[2],
              "archive_filename": year_idx[3], "filename": year_idx[4],
              "term": word_idx[0],
              "snippet": get_concordance_string(year_idx[5], word_idx[0],
                                                word_idx[1], window)})
            for word_idx in year_idx[6]])

    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
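
# A sketch of the inputs this concordance query expects: a lexicon file with
# one keyword or keysentence per line, plus a YAML configuration. The
# "preprocess" key is read directly by the code above; "data" is the key
# extract_data_file is assumed to read, and every value here is illustrative.
import yaml

with open("keywords.txt", "w") as f:
    f.write("kail\naff\nlairds\n")               # hypothetical lexicon
sample_lexicon_config = {
    "preprocess": "normalize",                   # none|normalize|lemmatize|stem
    "data": "keywords.txt",                      # assumed key name
    "os_type": "linux",
    "defoe_path": "/home/user/defoe/",           # hypothetical path
}
with open("concordance_config.yml", "w") as f:
    yaml.safe_dump(sample_lexicon_config, f)
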
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences and groups
    the results by word.

    The config_file must indicate the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also indicate the preprocessing treatment, along with
    the defoe path and the type of operating system.

    Returns result of form:

        {
          <WORD>:
          [
            [<YEAR>, <NUM_WORDS>],
            ...
          ],
          <WORD>:
          ...

        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by word
    :rtype: list
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0],
                                clean_page_as_string(page, defoe_path, os_type))
                               for page in year_document[1]])
    # [(year, preprocessed_clean_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1], preprocess_type))])
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                           get_sentences_list_matches(year_page[1],
                                                      keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(keysentence, (year, num_keysentences)), ...]
    # =>
    # [(keysentence, [year, num_keysentences]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda yearsentence_count:
             (yearsentence_count[0][1],
              (yearsentence_count[0][0], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
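
# A plain-Python sketch of the reduceByKey/groupByKey reshaping used above,
# with made-up counts: ((year, word), 1) pairs are summed, then regrouped so
# each word maps to its per-year totals. (The query above groups by word; the
# year-grouped variant below simply swaps the key order in the map step.)
from collections import defaultdict

pairs = [((1795, "kail"), 1), ((1795, "kail"), 1), ((1796, "kail"), 1),
         ((1795, "aff"), 1)]
counts = defaultdict(int)
for (year, word), n in pairs:
    counts[(year, word)] += n                     # reduceByKey(add)
by_word = defaultdict(list)
for (year, word), n in counts.items():
    by_word[word].append((year, n))               # groupByKey on the word
print(dict(by_word))  # {'kail': [(1795, 2), (1796, 1)], 'aff': [(1795, 1)]}
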
def do_query(archives, config_file=None, logger=None, context=None):
    """
    The query counts as a "hit" every page that contains a particular term
    from a lexicon, and groups the results by year.

    The config_file must indicate the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also indicate the preprocessing treatment, along with
    the defoe path and the type of operating system.

    If a term appears several times on the same page, it is still counted as "1".
            Example:
            1795:
            - - kail
              - 1
            - - aff
              - 4
            - - lairds
              - 1
    This means that kail appears in 1 page, aff in 4 pages, and lairds in 1
    page across all the books from the year 1795.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: list
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config, os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(
                word, preprocess_type) for word in k_split]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document) for document in list(archive)])
    # [(year, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0],
                                clean_page_as_string(page, defoe_path, os_type))
                               for page in year_document[1]])
    # [(year, preprocessed_clean_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1], preprocess_type))])
    filter_pages = pages.filter(
        lambda year_page: any(
            keysentence in year_page[1] for keysentence in keysentences))
    
    
    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                           get_sentences_list_matches(year_page[1],
                                                      keysentences)))
    

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])


    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts number of occurrences of keywords or keysentences and groups by year.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords/keysentences and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: list
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], clean_page_as_string(page))
                               for page in year_document[1]])
    # [(year, preprocessed_clean_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1], preprocess_type))])
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                           get_sentences_list_matches(year_page[1],
                                                      keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    The query counts as a "hit" every time it finds a term from our lexicon,
    and groups the results by book.

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with
    the defoe path and the type of operating system.

         - "'Twas on the morn of sweet May Day":
                - - neu
                     - 1
                - - blaw
                     - 5
     This means that neu appears once in the book 'Twas on the morn of sweet
     May Day, and blaw appears 5 times in the same book.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by title
    :rtype: list
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(title, document), ...]
    documents = archives.flatMap(lambda archive: [(
        document.title, document) for document in list(archive)])

    # [(title, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda title_document: [(title_document[0],
                                 clean_page_as_string(page, defoe_path, os_type))
                                for page in title_document[1]])
    # [(title, preprocessed_clean_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1], preprocess_type))])
    filter_pages = pages.filter(lambda title_page: any(
        keysentence in title_page[1] for keysentence in keysentences))

    # [(title, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda title_page: (title_page[0],
                            get_sentences_list_matches_per_page(title_page[1],
                                                                keysentences)))

    # [((title, keysentence), 1), ((title, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda title_sentence: [((title_sentence[0], sentence), 1)
                                for sentence in title_sentence[1]])

    # [((title, keysentence), num_keysentences), ...]
    # =>
    # [(title, (keysentence, num_keysentences)), ...]
    # =>
    # [(title, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda titlesentence_count:
             (titlesentence_count[0][0],
              (titlesentence_count[0][1], titlesentence_count[1]))) \
        .groupByKey() \
        .map(lambda title_sentencecount:
             (title_sentencecount[0], list(title_sentencecount[1]))) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingests NLS pages and applies a spaCy NLP pipeline to identify the
    possible locations on each page. It then applies the Edinburgh geoparser
    (just the georesolver) to obtain the latitude and longitude of each of
    them.

    Before applying the spaCy NLP pipeline, two cleaning steps are applied:
    replacing long-s characters and rejoining hyphenated words.

    A config_file should be supplied to specify the lang_model, the gazetteer
    to use, the defoe_path, the bounding box (optional), as well as the
    operating system.
    
    Example:
      - 1842:
        - archive: /home/rosa_filgueira_vicente/datasets/sg_simple_sample/97437554
        - edition: 1842, Volume 1
        - georesolution_page:
            - Aberdeenshire-19:
              - in-cc: ''
              - lat: '57.21923117162595'
              - long: '-2.801013003249016'
              - pop: ''
              - snippet: 'BUCHAN , a district of Aberdeenshire , extending along the coast '
              - type: civila
            - Cumberland-12:
              - in-cc: ''
              - lat: '51.4342921249674'
              - long: '-0.6131610294930387'
              - pop: ''
              - snippet: 'all the low country of Cumberland lies full before you , '
              - type: fac
             ....
        - lang_model: en_core_web_lg
        - page_filename: alto/97440572.34.xml
        - text_unit id: Page252
        - title: topographical, statistical, and historical gazetteer of Scotland
    

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    
    lang_model = config["lang_model"]
    gazetteer = config["gazetteer"]
    if "bounding_box" in config:
        bounding_box = " -lb " + config["bounding_box"] + " 2"
    else:
        bounding_box = ""
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type= "sys-i386-snow-leopard"
    else:
            os_type = "sys-i386-64"
    if "defoe_path" in config :
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    documents = archives.flatMap(
        lambda archive: [(document.year, document.title, document.edition, \
                          document.archive.filename, document) for document in list(archive)])
    
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                                year_document[3], page.code, page.page_id, clean_page_as_string(page,defoe_path, os_type)) for page in year_document[4]])

    matching_pages = pages_clean.map(
        lambda geo_page:
        (geo_page[0],
         {"title": geo_page[1],
          "edition": geo_page[2],
          "archive": geo_page[3], 
          "page_filename": geo_page[4],
          "text_unit id": geo_page[5],
          "lang_model": lang_model, 
          "georesolution_page": georesolve_page_2(geo_page[6],lang_model, defoe_path, gazetteer, bounding_box)}))
    
    result = matching_pages \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
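
# The lang_model named in the configuration (en_core_web_lg in the docstring
# example above) must be installed before this query can georesolve pages.
# A minimal sketch of checking for it, assuming spaCy itself is installed:
import spacy

try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Typically fetched with: python -m spacy download en_core_web_lg
    raise SystemExit("spaCy model en_core_web_lg is not installed")
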
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words (here configured to 40) for
    keywords, and groups the results by date.
    Stores the snippet (40 words before and after each term).

    config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also indicate the preprocessing treatment, along with
    the defoe path and the type of operating system.
    
    
    Returns result of form:
        {
          <YEAR>:
          [
            {archive_filename: ...,
             edition: ...,
             filename: ...,
             snippet: ...,
             term: ...,
             title: ...},
            ...
          ],
          <YEAR>:
          ...
        }
  


    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: list
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    window = 40
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document, document.title,
                          document.edition, document.archive.filename)
                         for document in list(archive)])

    # [(year, title, edition, archive_filename, page_code,
    #   clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[2],
                                year_document[3], year_document[4], page.code,
                                clean_page_as_string(page, defoe_path, os_type))
                               for page in year_document[1]])

    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], cl_page[1], cl_page[2], cl_page[3],
                          cl_page[4],
                          preprocess_clean_page(cl_page[5], preprocess_type))])
    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive_filename, filename, text,
    #   [(word, idx), (word, idx), ...]), ...]
    matching_idx = filter_pages.map(lambda year_page: (
        (year_page[0], year_page[1], year_page[2], year_page[3], year_page[4],
         year_page[5], get_text_keysentence_idx(year_page[5], keysentences))))

    # [(year, [{title, edition, archive_filename, filename, term,
    #           snippet}, ...]), ...]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
             {"title": year_idx[1], "edition": year_idx[2],
              "archive_filename": year_idx[3], "filename": year_idx[4],
              "term": word_idx[0],
              "snippet": get_concordance_string(year_idx[5], word_idx[0],
                                                word_idx[1], window)})
            for word_idx in year_idx[6]])

    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
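
# A simplified stand-in for get_concordance_string, sketching how a snippet
# of `window` words around a match index can be pulled from a tokenised page.
def concordance_sketch(text, match_idx, window):
    words = text.split()
    start = max(0, match_idx - window)
    return " ".join(words[start:match_idx + window + 1])

print(concordance_sketch("a b keyword c d", 2, 1))  # -> "b keyword c"
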
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets the concordance (also called details) of occurrences of keywords or
    keysentences and groups the results by year.

    The config_file must indicate the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also indicate the preprocessing treatment, along with
    the defoe path and the type of operating system.

    Returns result of form:

        {
          <YEAR>:
          [
            {title: ...,
             place: ...,
             publisher: ...,
             page_number: ...,
             snippet: ...,
             term: ...,
             document_id: ...,
             filename: ...},
            ...
          ],
          <YEAR>:
          ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: concordance of keyword occurrences grouped by year
    :rtype: list
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, document, page, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], page,
                                clean_page_as_string(page, defoe_path, os_type))
                               for page in year_document[1]])
    # [(year, document, page, preprocessed_clean_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], cl_page[1], cl_page[2],
                          preprocess_clean_page(cl_page[3], preprocess_type))])
    # Filter on the preprocessed page string (element 3 of the tuple).
    filter_pages = pages.filter(
        lambda year_page: any(keysentence in year_page[3]
                              for keysentence in keysentences))

    # [(year, document, page, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0], year_page[1], year_page[2],
                           get_sentences_list_matches(year_page[3],
                                                      keysentences)))

    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], sentence)\
                                for sentence in year_sentence[3]])

    matching_data = matching_sentences.map(
        lambda page_data: (page_data[0], {
            "title": page_data[1].title,
            "place": page_data[1].place,
            "publisher": page_data[1].publisher,
            "page_number": page_data[2].code,
            "snippet": page_data[2].content,
            "term": page_data[3],
            "document_id": page_data[1].code,
            "filename": page_data[1].archive.filename
        }))


    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result