예제 #1
0
def get_n_count_6grams(domain, minimum=3):
    """
    Extract all 6-grams of a domain and store their frequency and metrics.

    The rows returned by ``getting_6grams.sql`` are tallied with a Counter;
    each row unpacks to six values ``(w1, ..., w6)``. For every 6-gram seen
    at least ``minimum`` times the function

    * looks up the leading 5-gram ``(w1..w5)`` with
      ``select_to_count_6grams.sql``, which yields ``(id_5gram, frequency)``;
    * looks up the frequency of the last token ``w6`` in ``unigrams``
      (``w6`` is matched against ``id_unigram``);
    * computes ``pmi``, ``t_score`` and ``logDice`` from those frequencies;
    * inserts the result into ``6grams``, or updates the existing row when
      the insert raises ``IntegrityError``.

    No commit is issued here.

    :param domain: int, domain_id from which you want extract n-grams. It is
        also interpolated into column names (``d<domain>_freq``,
        ``freq<domain>``), so it must come from a trusted source.
    :param minimum: int, frequency threshold
    :return: None
    """
    cursor.execute(sql('getting_6grams.sql'), (domain, ))
    counts = Counter(cursor.fetchall())
    log.debug('Counted!')

    n = get_domain_size(domain)

    # These statements depend only on ``domain``: load and format them once
    # instead of once per qualifying 6-gram.
    select_5gram = sql('select_to_count_6grams.sql').format(domain)
    select_last_word = (
        "SELECT freq{} FROM unigrams WHERE id_unigram = %s".format(domain))
    insert_6gram = """
        INSERT INTO 6grams (5gram, token, d{}_freq, d{}_logdice, d{}_pmi, d{}_tsc)
        VALUES (%s, %s, %s, %s, %s, %s)
        """.format(*[domain] * 4)
    update_6gram = """
        UPDATE 6grams
            SET d{}_freq = %s,
                d{}_logdice = %s,
                d{}_pmi = %s,
                d{}_tsc = %s
            WHERE 5gram = %s
                AND token = %s
        """.format(*[domain] * 4)

    # Only the first 6-gram that is actually processed is traced in the log.
    trace_first = True

    for ngram, colloc_freq in counts.items():
        if colloc_freq < minimum:
            continue

        w1, w2, w3, w4, w5, w6 = ngram
        if trace_first:
            log.debug("Started! %s", ngram)

        cursor.execute(select_5gram, (w1, w2, w3, w4, w5))
        id_5gram, pattern_freq = cursor.fetchone()

        cursor.execute(select_last_word, (w6, ))
        lw_freq = cursor.fetchone()[0]

        pmisc = pmi(colloc_freq, pattern_freq, lw_freq, n)
        tsc = t_score(colloc_freq, pattern_freq, lw_freq, n)
        logdsc = logDice(colloc_freq, pattern_freq, lw_freq)

        try:
            cursor.execute(
                insert_6gram,
                (id_5gram, w6, colloc_freq, logdsc, pmisc, tsc))
        except IntegrityError:
            # The (5gram, token) row already exists: refresh this domain's
            # columns instead.
            cursor.execute(
                update_6gram,
                (colloc_freq, logdsc, pmisc, tsc, id_5gram, w6))

        # Report completion only for a 6-gram that was really written, then
        # silence the per-item trace.
        if trace_first:
            log.debug('Counted and inserted!')
            trace_first = False
예제 #2
0
def count_all_domains_bigr(minimum=1):
    """
    Compute logDice, PMI and t-score for bigrams over the whole corpus.

    The corpus size is taken as the number of rows in ``words``. Every row
    produced by ``selecting_all_2grams.sql`` is unpacked as
    ``(_id, colloc_freq, pattern_freq, lw_freq)``. Rows whose ``colloc_freq``
    per million words reaches ``minimum`` are scored, and the scores are
    written to the ``logdice``, ``pmi`` and ``tscore`` columns of ``2grams``.
    The transaction is committed at the end.

    :param minimum: n occurrences per million
    :return: None
    """

    log.info('Counting metrics!')
    count_words = """
        SELECT 
            COUNT(*)
        FROM
            (SELECT 
                id_word
            FROM
                words) 
            as a
        """
    cursor.execute(count_words)
    corpus_size = cursor.fetchone()[0]
    log.info('Total corpus size %s', corpus_size)

    # Corpus size expressed in millions of words, used to normalise counts.
    millions = corpus_size / 1000000

    cursor.execute(sql('selecting_all_2grams.sql'))
    log.debug('Selected!')

    # Scores are collected first and written afterwards, because the same
    # cursor is being iterated here.
    scored = set()
    for bigram_id, colloc_freq, pattern_freq, lw_freq in cursor:
        per_million = colloc_freq / millions
        if per_million >= minimum:
            pmi_value = pmi(colloc_freq, pattern_freq, lw_freq, corpus_size)
            t_value = t_score(colloc_freq, pattern_freq, lw_freq, corpus_size)
            dice_value = logDice(colloc_freq, pattern_freq, lw_freq)
            scored.add((dice_value, pmi_value, t_value, bigram_id))

    log.info('Ready to insert')
    update_bigram = """
            UPDATE 2grams 
                SET 
                    logdice = %s,
                    pmi = %s,
                    tscore = %s
                WHERE
                    id_bigram = %s 
            """
    for row in scored:
        cursor.execute(update_bigram, row)
    cnx.commit()
    log.info('Metrics for bigram in all corpus are commited!')
예제 #3
0
def get_n_count_3grams(domain, minimum=3):
    """
    Extract all 3-grams of a domain and store their frequency and metrics.

    Rows from ``getting_3grams.sql`` are tallied with a Counter; each row
    unpacks to ``(w1, w2, w3)``. For every trigram seen at least ``minimum``
    times, the leading bigram is looked up in ``2grams`` and the last token
    in ``unigrams``. ``pmi``, ``t_score`` and ``logDice`` are then computed
    and the result is inserted into ``3grams``, or updated there when the
    insert raises ``IntegrityError``. No commit is issued here.

    NOTE(review): a zero bigram or unigram frequency is only printed; the
    metrics are still computed with it afterwards -- confirm the metric
    helpers tolerate zeros.

    :param domain: int, domain_id from which you want extract n-grams. It is
        also interpolated into column names of the statements.
    :param minimum: int, frequency threshold
    :return: None
    """

    log.info('Counting trigrams for domain %s', domain)

    cursor.execute(sql('getting_3grams.sql'), (domain, ))
    counts = Counter(cursor.fetchall())
    log.debug('Counted!')
    domain_size = get_domain_size(domain)

    # Statements that depend only on ``domain``, formatted once up front.
    select_bigram = """
                SELECT id_bigram, d{}_freq 
                FROM 2grams WHERE wordform_1 = %s AND wordform_2 = %s
                """.format(domain)
    select_last_word = (
        "SELECT freq{} FROM unigrams WHERE id_unigram = %s".format(domain))
    insert_trigram = """
                INSERT INTO 3grams (bigram, token, d{}_freq, d{}_logdice, d{}_pmi, d{}_tsc) 
                VALUES (%s, %s, %s, %s, %s, %s)
                """.format(*[domain] * 4)
    update_trigram = """
                UPDATE 3grams 
                SET d{}_freq = %s, 
                    d{}_logdice = %s, 
                    d{}_pmi = %s, 
                    d{}_tsc = %s 
                WHERE bigram = %s 
                    AND token = %s
                    """.format(*[domain] * 4)

    for trigram, colloc_freq in counts.items():
        if colloc_freq < minimum:
            continue

        w1, w2, w3 = trigram
        log.debug("Started! %s", trigram)
        # Coarse progress marker keyed on the first token's id.
        if w1 % 100 == 0:
            log.debug("working with %s", w1)

        cursor.execute(select_bigram, (w1, w2))
        id_bigram, pattern_freq = cursor.fetchone()
        if pattern_freq == 0:
            print(w1, ' ', w2)
            print(trigram, colloc_freq)

        cursor.execute(select_last_word, (w3, ))
        lw_freq = cursor.fetchone()[0]
        if lw_freq == 0:
            print(w3)

        pmi_value = pmi(colloc_freq, pattern_freq, lw_freq, domain_size)
        t_value = t_score(colloc_freq, pattern_freq, lw_freq, domain_size)
        dice_value = logDice(colloc_freq, pattern_freq, lw_freq)

        try:
            cursor.execute(
                insert_trigram,
                (id_bigram, w3, colloc_freq, dice_value, pmi_value, t_value))
        except IntegrityError:
            # Row for (bigram, token) already exists: refresh this domain's
            # columns instead.
            cursor.execute(
                update_trigram,
                (colloc_freq, dice_value, pmi_value, t_value, id_bigram, w3))

    log.debug('3-grams are counted and inserted!')
예제 #4
0
def count_2metrics(domain, minimum=3):
    """
    Compute the metrics of already extracted 2-grams for one domain.

    The domain size is the number of ``words`` rows whose text belongs to
    ``domain`` according to ``metadata``. Each bigram is read together with
    the domain frequencies of its two word forms. Bigrams whose domain
    frequency is at least ``minimum`` are scored with ``pmi``, ``t_score``
    and ``logDice``, and the scores are written to the ``d<domain>_logdice``,
    ``d<domain>_pmi`` and ``d<domain>_tsc`` columns of ``2grams``.

    NOTE(review): no commit is issued here although the final log message
    says 'Commited!' -- confirm that the caller commits.

    :param domain: int, domain_id, which size will be used to calculate
        related metrics. It is also interpolated into column names.
    :param minimum: int, frequency threshold.
    :return: None
    """

    log.info('Counting metrics!')
    count_domain_words = """
        SELECT 
            COUNT(*)
        FROM
            (SELECT 
                id_word
            FROM
                words, metadata
            WHERE
                words.id_text = metadata.id_text
                AND 
                    metadata.id_domain = (%s)
                    )
            AS sd
            """
    cursor.execute(count_domain_words, (domain, ))
    domain_size = cursor.fetchone()[0]
    log.info('Domain size %s', domain_size)

    select_bigrams = """
        SELECT 
            id_bigram AS id,
            d{}_freq AS freq,
            w1.freq{} AS w1_freq,
            w2.freq{} AS w2_freq
        FROM
            2grams,
            unigrams AS w1,
            unigrams AS w2
        WHERE
            2grams.wordform_1 = w1.id_unigram
            AND 
                2grams.wordform_2 = w2.id_unigram
                """.format(domain, domain, domain)
    cursor.execute(select_bigrams)
    log.debug('Selected!')

    # Scores are collected first and written afterwards, because the same
    # cursor is being iterated here.
    scored = set()
    for bigram_id, colloc_freq, first_freq, second_freq in cursor:
        if colloc_freq >= minimum:
            pmi_value = pmi(colloc_freq, first_freq, second_freq, domain_size)
            t_value = t_score(
                colloc_freq, first_freq, second_freq, domain_size)
            dice_value = logDice(colloc_freq, first_freq, second_freq)
            scored.add((dice_value, pmi_value, t_value, bigram_id))

    log.info('Ready to insert22')
    update_bigram = """
            UPDATE 2grams 
            SET 
                d{}_logdice = %s,
                d{}_pmi = %s,
                d{}_tsc = %s
            WHERE
                id_bigram = %s 
                """.format(domain, domain, domain)
    for row in scored:
        # Print every row whose bigram id is a multiple of 1000.
        if row[-1] % 1000 == 0:
            print(row)
        cursor.execute(update_bigram, row)

    # NOTE(review): the commit call is disabled in this function.
    log.info('Commited!')