Example #1
def redis_zincr_words(pipe, filename, minlength, maxlength):
    """Create news sorted set in redis.

    :param pipe: -- Redis pipeline used to buffer the ZINCRBY commands.
    :param filename: -- The absolute path of the file.gz to process.
    :param minlength: -- (int) Minimum length of the words inserted.
    :param maxlength: -- (int) Maximum length of the words inserted.

    Representation of the set in redis:

    +------------+------------+-----------+
    |     Keys   | Members    | Scores    |
    +============+============+===========+
    | 20131001   | word1      | 142       |
    +------------+------------+-----------+
    | ...        | word2      | 120       |
    +------------+------------+-----------+
    | 20131002   | ...        | ...       |
    +------------+------------+-----------+

    This function stores every word whose length is between minlength and
    maxlength in Redis. Redis also keeps, for each day, the number of times
    each word appeared: its score.

    """
    tokenizer = RegexpTokenizer(r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                gaps=True, discard_empty=True)

    with gzip.open(filename, 'rb') as F:

        blob = TextBlob(clean(F.read()), tokenizer=tokenizer)

        for word in blob.tokens:

            if minlength <= len(word) <= maxlength:
                # The key is the paste's date (YYYYMMDD), taken from the
                # 'YYYY/MM/DD' part of the path at filename[-22:-12].
                # Note: redis-py >= 3.0 expects zincrby(name, amount, value);
                # the call order below is the redis-py 2.x one.
                pipe.zincrby(filename[-22:-12].replace('/', ''), word, 1)
            elif len(word) > maxlength:
                publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename))
                publisher.info(word)

        pipe.execute()
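
A minimal usage sketch for the function above. The Redis settings and the paste path are made up for illustration, clean() and publisher are helpers from the surrounding project, and the zincrby call above follows the redis-py 2.x argument order.

import redis

r_serv = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
pipe = r_serv.pipeline(transaction=False)

# Hypothetical path: filename[-22:-12] must cover the 'YYYY/MM/DD' part of
# the path, so the key for this paste becomes '20131001'.
paste = '/home/pastes/2013/10/01/paste001.gz'
redis_zincr_words(pipe, paste, minlength=3, maxlength=30)

# Read back the 10 most frequent words counted for that day.
for word, score in r_serv.zrevrange('20131001', 0, 9, withscores=True):
    print('{0} {1}'.format(word, int(score)))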
Example #2
def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set):
    """Tokenizing on word category

    :param r_serv: -- Redis database connection
    :param listname: -- (str) Path to the file listing the paths of the category files
    :param choicedatastruct: -- (bool) Selects which of the two data structures below is used
    :param nb: -- (int) Maximum number of pastes processed by the function
    :param r_set: -- Name of the Redis list from which the paste filenames are popped

    The Redis data structure can be chosen as follows:

    +---------------+------------+-----------+
    |     Keys      | Members    | Scores    |
    +===============+============+===========+
    | mails_categ   | filename   | 25000     |
    +---------------+------------+-----------+
    | ...           | filename2  | 2400      |
    +---------------+------------+-----------+
    | web_categ     | ...        | ...       |
    +---------------+------------+-----------+

    Or

    +--------------+-------------+-----------+
    |     Keys     | Members     | Scores    |
    +==============+=============+===========+
    | filename     | mails_categ | 100000    |
    +--------------+-------------+-----------+
    | ...          | web_categ   | 24050     |
    +--------------+-------------+-----------+
    | filename2    | ...         | ...       |
    +--------------+-------------+-----------+

    This function tokenizes on all special characters like: @^\|[{#~}]!:;$^=
    and inserts data into Redis when a token matches a keyword from a
    previously created list.
    These keyword lists can contain anything you want, but it is better to
    build them as "categories" of keywords.

    """

    try:
        for n in xrange(nb):
            filename = r_serv.lpop(r_set)

            if filename is not None:

                tokenizer = RegexpTokenizer(r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                            gaps=True, discard_empty=True)
                set_listof_pid(r_serv, filename, sys.argv[0])

                with open(listname, 'rb') as L:
                    # for each "categ" listed in the file
                    for fname in L:
                        # keywords of the current categ
                        tmp_list = []
                        # for each keyword (fname[:-1] drops the trailing newline)
                        with open(fname[:-1], 'rb') as LS:

                            for kword in LS:
                                tmp_list.append(kword[:-1])

                            # for each paste
                            with gzip.open(filename, 'rb') as F:

                                blob = TextBlob(clean(F.read()),
                                                tokenizer=tokenizer)

                                # for each (lowercased) paste token
                                for word in blob.tokens.lower():

                                    if word in tmp_list:
                                        # choosing between the two data structures
                                        if choicedatastruct:
                                            # key = paste filename, member = category name
                                            r_serv.zincrby(filename,
                                                           fname.split('/')[-1][:-1],
                                                           1)
                                        else:
                                            # key = category name, member = paste filename
                                            r_serv.zincrby(fname.split('/')[-1][:-1],
                                                           filename,
                                                           1)

                update_listof_pid(r_serv)

            else:
                publisher.debug("Empty list")
                #r_serv.save()
                break

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")