Example #1
def test_chunk_json_lines_with_remainder(fs):
    out_file_path = "wordforms"
    out_file = open(out_file_path, 'w')
    write_json_lines(out_file, iterate_wf(["wf1", "wf2", "wf3", "wf4", "wf5"]))
    out_file.close()
    out_file = open(out_file_path, 'r')
    res = list(chunk_json_lines(out_file, 2))
    out_file.close()
    outp = [
        [{
            "wordform": "wf1"
        }, {
            "wordform": "wf2"
        }],
        [{
            "wordform": "wf3"
        }, {
            "wordform": "wf4"
        }],
        [{
            "wordform": "wf5"
        }],
    ]

    assert outp == res
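
The helpers called here (`iterate_wf`, `write_json_lines`, `chunk_json_lines`) are not defined in these examples. Below is a minimal sketch of `iterate_wf` and `chunk_json_lines`, inferred purely from how the test uses them, so names and details are assumptions rather than the project's actual implementation.

import json


def iterate_wf(words):
    # Yield one {"wordform": ...} dict per word, the shape asserted above.
    for word in words:
        yield {"wordform": word}


def chunk_json_lines(file_handle, batch_size=1000):
    # Group parsed JSON-lines objects into lists of at most batch_size items;
    # the last chunk may be shorter (the "remainder" this test checks).
    chunk = []
    for line in file_handle:
        chunk.append(json.loads(line))
        if len(chunk) == batch_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk
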
Example #2
def connect_anahashes_to_wordforms(session, anahashes, df, batch_size=50000):
    """
    Create the relation between wordforms and anahashes in the database.

    Given `anahashes`, a dataframe with wordforms and corresponding anahashes,
    create the relations between the two in the wordforms and anahashes tables
    by setting the anahash_id foreign key in the wordforms table.
    """
    LOGGER.info('Connecting anahashes to wordforms.')

    LOGGER.debug('Getting wordform/anahash_id pairs.')
    with get_temp_file() as anahash_to_wf_file:
        total_lines_written = write_json_lines(
            anahash_to_wf_file, get_anahashes(session, anahashes, df))

        update_statement = Wordform.__table__.update(). \
            where(Wordform.wordform_id == bindparam('wf_id')). \
            values(anahash_id=bindparam('a_id'))

        LOGGER.debug('Adding the connections wordform -> anahash_id.')
        sql_query_batches(session, update_statement,
                          read_json_lines(anahash_to_wf_file),
                          total_lines_written, batch_size)

    LOGGER.info('Added the anahash of %s wordforms.', total_lines_written)

    return total_lines_written
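
`sql_query_batches` is only used here, not defined. A plausible sketch of such a batched executor follows, assuming it consumes an iterator of parameter dicts (here `{'wf_id': ..., 'a_id': ...}` pairs, matching the bindparams above) and executes the statement once per batch; this is an illustration, not the ticclat implementation.

from itertools import islice

from tqdm import tqdm


def sql_query_batches(session, statement, params_iterator, total, batch_size):
    # Execute `statement` with executemany semantics, batch_size parameter
    # dicts at a time, so a very large update never has to be held in memory
    # or sent to the database in a single round trip.
    params_iterator = iter(params_iterator)
    with tqdm(total=total) as pbar:
        while True:
            batch = list(islice(params_iterator, batch_size))
            if not batch:
                break
            session.execute(statement, batch)
            pbar.update(len(batch))
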
Example #3
def test_read_and_write_json_lines_empty(fs):
    objects = []

    fname = 'objects'

    f = open(fname, 'w')

    write_json_lines(f, objects)

    f.close()
    f = open(fname, 'r')

    assert os.path.exists(fname)
    assert os.path.getsize(fname) == 0

    results = [o for o in read_json_lines(f)]
    f.close()

    assert objects == results
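
`write_json_lines` and `read_json_lines` themselves are not shown. A minimal sketch consistent with the tests (one JSON document per line, a count of written lines returned, lazy reading), offered as an assumption about their behaviour rather than the actual code:

import json


def write_json_lines(file_handle, objects):
    # Serialize each object to a single line of JSON; return the number of
    # lines written (0 for an empty iterable, as this test asserts).
    count = 0
    for obj in objects:
        file_handle.write(json.dumps(obj))
        file_handle.write('\n')
        count += 1
    return count


def read_json_lines(file_handle):
    # Lazily yield one parsed object per line.
    for line in file_handle:
        yield json.loads(line)
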
Example #4
def test_chunk_json_lines_without_remainder(fs):
    out_file_path = "wordforms"
    out_file = open(out_file_path, 'w')
    write_json_lines(out_file, iterate_wf(["wf1", "wf2", "wf3", "wf4"]))
    out_file.close()

    out_file = open(out_file_path, 'r')
    res = list(chunk_json_lines(out_file, 2))
    out_file.close()

    outp = [[{
        'wordform': 'wf1'
    }, {
        'wordform': 'wf2'
    }], [{
        'wordform': 'wf3'
    }, {
        'wordform': 'wf4'
    }]]

    assert outp == res
Example #5
def add_morphological_paradigms(session, in_file):
    """
    Add morphological paradigms to database from CSV file.
    """
    data = pd.read_csv(in_file,
                       sep='\t',
                       index_col=False,
                       names=[
                           'wordform', 'corpus_freq', 'component_codes',
                           'human_readable_c_code', 'first_year', 'last_year',
                           'dict_ids', 'pos_tags', 'int_ids'
                       ])
    # drop first row (contains empty wordform)
    data = data.drop([0])

    # store wordforms in the database
    wfs = data[['wordform']].copy()
    bulk_add_wordforms(session, wfs)

    # get the morphological variants from the pandas dataframe
    LOGGER.info('Extracting morphological variants.')
    morph_paradigms_per_wordform = defaultdict(list)
    with tqdm(total=data.shape[0]) as pbar:
        for row in data.iterrows():
            codes = row[1]['component_codes'].split('#')
            wordform = row[1]['wordform']
            for code in codes:
                morph_paradigms_per_wordform[wordform].append(
                    split_component_code(code, wordform))
            pbar.update()

    LOGGER.info('Looking up wordform ids.')
    select_statement = select([Wordform
                               ]).where(Wordform.wordform.in_(wfs['wordform']))
    mapping = session.execute(select_statement).fetchall()

    LOGGER.info('Writing morphological variants to file.')
    with get_temp_file() as mp_file:
        total_lines_written = write_json_lines(
            mp_file, morph_iterator(morph_paradigms_per_wordform, mapping))
        LOGGER.info('Wrote %s morphological variants.', total_lines_written)
        LOGGER.info('Inserting morphological variants to the database.')
        sql_insert_batches(session,
                           MorphologicalParadigm,
                           read_json_lines(mp_file),
                           batch_size=50000)
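
A hypothetical call site for `add_morphological_paradigms`. The input is a headerless, tab-separated file whose columns follow the `names` list passed to `pd.read_csv` above; `get_session` is the session factory mentioned in the docstrings elsewhere in these examples, so its import path and arguments are assumptions.

from dbutils import get_session  # assumed import path

session = get_session()  # assumption: however your project builds a session

# paradigms.tsv: tab-separated, no header row, columns in this order:
# wordform, corpus_freq, component_codes, human_readable_c_code,
# first_year, last_year, dict_ids, pos_tags, int_ids
with open('paradigms.tsv') as in_file:
    add_morphological_paradigms(session, in_file)
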
Example #6
def test_read_and_write_json_lines(fs):
    objects = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]

    fname = 'objects'

    f = open(fname, 'w')

    total = write_json_lines(f, objects)
    f.close()
    f = open(fname, 'r')

    assert os.path.exists(fname)
    assert total == len(objects)

    results = [o for o in read_json_lines(f)]
    f.close()

    assert objects == results
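
The library functions in these examples wrap this same write/read round trip in `get_temp_file` instead of a named file. A sketch of what such a context manager could look like, inferred only from how it is used (it must yield a file object that can be written and then read back), so treat it as an assumption:

import contextlib
import tempfile


@contextlib.contextmanager
def get_temp_file():
    # Yield a temporary file opened for writing and reading; callers write
    # JSON lines to it and read them back within the same block.
    # Note: whoever reads afterwards must seek(0) first; the real helper
    # (or the read helpers) may handle this differently.
    with tempfile.TemporaryFile(mode='w+') as temp_file:
        yield temp_file
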
Example #7
def add_corpus_core(session,
                    corpus_matrix,
                    vectorizer,
                    corpus_name,
                    document_metadata=pd.DataFrame(),
                    batch_size=50000):
    """
    Add a corpus to the database.

    A corpus is a collection of documents, which is a collection of words.
    This function adds all words as wordforms to the database, records their
    "attestation" (the fact that they occur in a certain document and with what
    frequency), adds the documents they belong to, adds the corpus and adds the
    corpus ID to the documents.

    Inputs:
        session: SQLAlchemy session (e.g. from `dbutils.get_session`)
        corpus_matrix: the dense corpus term-document matrix, like from
                       `tokenize.terms_documents_matrix_ticcl_frequency`
        vectorizer: the terms in the term-document matrix, as given by
                    `tokenize.terms_documents_matrix_ticcl_frequency`
        corpus_name: the name of the corpus in the database
        document_metadata: see `ticclat_schema.Document` for all the possible
                           metadata. Make sure the index of this dataframe
                           matches with the document identifiers in the term-
                           document matrix, which can be easily achieved by
                           resetting the index for a Pandas dataframe.
        batch_size: batch handling of wordforms to avoid memory issues.
    """
    with get_temp_file() as wf_file:
        write_json_lines(wf_file, iterate_wf(vectorizer.vocabulary_))

        # Prepare the documents to be added to the database
        LOGGER.info('Creating document data')
        corpus_csr = scipy.sparse.csr_matrix(corpus_matrix)
        word_counts = corpus_csr.sum(axis=1)  # sum the rows

        wc_list = np.array(word_counts).flatten().tolist()

        document_metadata['word_count'] = wc_list

        # Determine which wordforms in the vocabulary need to be added to the
        # database
        LOGGER.info('Determine which wordforms need to be added')
        with get_temp_file() as wf_to_add_file:
            with tqdm(total=count_lines(wf_file)) as pbar:
                for chunk in chunk_json_lines(wf_file, batch_size=batch_size):
                    # Find out which wordforms are not yet in the database
                    wordforms = {wf['wordform'] for wf in chunk}
                    select_statement = select([Wordform]).where(
                        Wordform.wordform.in_(wordforms))
                    result = session.execute(select_statement).fetchall()

                    # wf: (id, wordform, anahash_id, wordform_lowercase)
                    existing_wfs = {wf[1] for wf in result}
                    for wordform in wordforms.difference(existing_wfs):
                        wf_to_add_file.write(
                            json.dumps({
                                'wordform': wordform,
                                'wordform_lowercase': wordform.lower()
                            }))
                        wf_to_add_file.write('\n')
                    pbar.update(batch_size)

            # Create the corpus (in a session) and get the ID
            LOGGER.info('Creating the corpus')
            corpus = Corpus(name=corpus_name)
            session.add(corpus)

            # add the documents using ORM, because we need to link them to the
            # corpus
            LOGGER.info('Adding the documents')
            for doc in document_metadata.to_dict(orient='records'):
                document_obj = Document(**doc)
                document_obj.document_corpora.append(corpus)
            session.flush()
            corpus_id = corpus.corpus_id

            # Insert the wordforms that need to be added using SQLAlchemy core (much
            # faster than using the ORM)
            LOGGER.info('Adding the wordforms')
            bulk_add_wordforms_core(session, read_json_lines(wf_to_add_file))

    LOGGER.info('Prepare adding the text attestations')
    # make a mapping from wordform to its column index in the term-document matrix
    df = pd.DataFrame.from_dict(vectorizer.vocabulary_, orient='index')
    df = df.reset_index()

    LOGGER.info('\tGetting the wordform ids')
    wf_mapping = {}

    for chunk in chunk_df(df, batch_size=batch_size):
        to_select = list(chunk['index'])
        select_statement = select([Wordform
                                   ]).where(Wordform.wordform.in_(to_select))
        result = session.execute(select_statement).fetchall()
        for wordform in result:
            # wordform: (id, wordform, anahash_id, wordform_lowercase)
            wf_mapping[wordform[1]] = wordform[0]

    LOGGER.info('\tGetting the document ids')
    # get doc_ids
    select_statement = select([corpusId_x_documentId.join(Corpus).join(Document)]) \
        .where(Corpus.corpus_id == corpus_id).order_by(Document.document_id)
    result = session.execute(select_statement).fetchall()
    # row: (corpus_id, document_id, ...)
    doc_ids = [row[1] for row in result]

    LOGGER.info('\tReversing the mapping')
    # reverse mapping from wordform to id in the terms/document matrix
    word_from_tdmatrix_id = dict(
        zip(vectorizer.vocabulary_.values(), vectorizer.vocabulary_.keys()))

    LOGGER.info('\tGetting the text attestations')
    with get_temp_file() as ta_file:
        write_json_lines(
            ta_file,
            get_tas(corpus_matrix, doc_ids, wf_mapping, word_from_tdmatrix_id))

        LOGGER.info('Adding the text attestations')
        total = count_lines(ta_file)
        bulk_add_textattestations_core(session,
                                       read_json_lines(ta_file),
                                       total=total,
                                       batch_size=batch_size)
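
Finally, a hypothetical end-to-end call of `add_corpus_core`. `get_session` and `terms_documents_matrix_ticcl_frequency` are only referenced in the docstring, so their import paths, arguments, and return values are assumptions in this sketch.

import pandas as pd

from dbutils import get_session  # assumed import path
from tokenize import terms_documents_matrix_ticcl_frequency  # assumed import path

session = get_session()  # assumption: however your project builds a session

# assumption: the tokenizer returns a (term-document matrix, vectorizer) pair
freq_files = ['corpus1.freq.tsv', 'corpus2.freq.tsv']  # placeholder inputs
corpus_matrix, vectorizer = terms_documents_matrix_ticcl_frequency(freq_files)

# One row of metadata per document, index aligned with the document order in
# the matrix (hence reset_index, as the docstring advises); 'title' is a
# placeholder metadata field.
document_metadata = pd.DataFrame({'title': ['doc1', 'doc2']}).reset_index(drop=True)

add_corpus_core(session, corpus_matrix, vectorizer, 'my_corpus',
                document_metadata=document_metadata, batch_size=50000)
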