# NOTE(review): fragment of a larger loop body — the enclosing `for` is not
# visible here. `current_string`, `f` (a parser object), `out_file`, and
# `Counter` are defined outside this excerpt; `continue` targets the unseen loop.
list_id = list_id.lstrip('(')
        # print(list_id)
        # Guard: skip records whose list_id is empty after stripping '('.
        if not len(list_id):
            print('list_id,', list_id, ' zero length... go to next record')
            continue
        # Guard: skip records whose list_id is not purely numeric.
        if not list_id.isdigit():
            print('list_id not numeric... go to next record')
            continue
        # Everything after the first field is treated as the text payload.
        curr_string = ' '.join(current_string[1:])
        # print('starting with text :', curr_string)
        # print('working on list :',list_id)
        # list_dict = get_list_dists(list_id)

        # Term-frequency accumulator for this record.
        c = Counter()

        parsed_text = f.parse(curr_string, is_list=True)
        # print(parsed_text)
        if parsed_text is not None:
            # Tally unigrams by count; bigrams and entities are counted
            # once each per record (dict.fromkeys(..., 1)).
            c.update(parsed_text['text'])
            c.update(dict.fromkeys(parsed_text['bigrams'], 1))
            c.update(dict.fromkeys(parsed_text['entities'], 1))
            # else:
            #     print('parsed text is empty')
            # cycle through dictionary
            # Write a file

            # NOTE(review): this "empty" report can only fire when parse
            # returned non-None yet produced no tokens; a None result is
            # silently skipped. Also the writes concatenate list_id directly
            # against 'parsed text is empty' with no separating space —
            # presumably unintended; confirm desired output format.
            if not c.items():
                out_file.write('\n')
                out_file.write('for list_id = ')
                out_file.write(list_id)
                out_file.write('parsed text is empty\n')
# Beispiel #2 (scraped example separator — not Python code)
# 0
    # NOTE(review): fragment — `cursor`, `get_listinfo_for_member`, `f`,
    # `string`, `nltk`, and `FreqDist` are bound by the surrounding file,
    # which is not visible in this excerpt.
    rows = cursor.fetchall()
    for row in rows:
        member_id = row[0]
        mlistcount = row[1]
        print('working on member_id', ' ', member_id)
        # Fetch every list row belonging to this member, then accumulate
        # all parsed words into a single text blob (tstrout).
        cursor.execute(get_listinfo_for_member, [member_id])
        tstrout = ''
        # NOTE(review): this rebinding shadows the outer `rows` that the
        # enclosing loop is iterating — works in CPython because the outer
        # `for` holds its own iterator, but it is fragile; confirm intent.
        rows = cursor.fetchall()
        for row in rows:
            # for current_line in reader:
            c_line = str(row)
            # c_line=str(current_line.strip())
            # Drop non-printable characters before parsing.
            c_line = ''.join(filter(lambda x: x in string.printable, c_line))
            print(c_line)
            if len(c_line):
                s = f.parse(c_line, True)
                strout = ''
                if s is not None:
                    # Flatten every value list of the parse result into words.
                    # The inner loop variable deliberately rebinds `word`.
                    for item in s.items():
                        # print(1,str(item[1]))
                        word = []
                        for word in item[1]:
                            strout = strout + ' ' + str(word)

                if len(strout):
                    tstrout = tstrout + ' ' + strout
        # print(tstrout)
        # Frequency distribution over all words gathered for this member.
        words = nltk.tokenize.word_tokenize(tstrout)
        the_list_dist = FreqDist(words)

        # print('for',member_id,'on',mlistcount[0],'lists:',the_list_dist.most_common(10))
# Beispiel #3 (scraped example separator — not Python code)
# 0
    # NOTE(review): fragment of a loop over input lines; `c_line`, `f`,
    # `string`, `tstrout`, `nltk`, and `FreqDist` come from unseen context.
    # list_id = c_line[:c_line.find(',')]
    # NOTE(review): c_line[0] is the FIRST CHARACTER of the string, so the
    # isdigit() guard below checks only one character, and
    # ' '.join(c_line[1:]) space-separates the remaining *characters*.
    # Presumably c_line was meant to be a split/CSV list — confirm upstream.
    list_id = c_line[0]
    if not list_id.isdigit():
        # reader_csv.next()
        continue
    c_line = str(c_line.strip())
    # c_string is everything after the list_id
    c_string = ' '.join(c_line[1:])
    # print('working on list :', list_id, file=sys.stderr)
    print('working on list :', list_id)

    # Normalize: strip whitespace and remove non-printable characters.
    c_string = str(c_string.strip())
    c_string = ''.join(filter(lambda x: x in string.printable, c_string))
    if len(c_string):
        print('printable chars:', c_string)
        s = f.parse(c_string, True)
        strout = ''
        if s is not None:
            # Flatten every value list of the parse result into words.
            for item in s.items():
                # print(1,str(item[1]))
                word = []
                for word in item[1]:
                    strout = strout + ' ' + word
        # Accumulate this line's words into the running blob.
        if len(strout):
            tstrout = tstrout + ' ' + strout
# After the (unseen) loop: frequency distribution over every word gathered.
print(tstrout)
words = nltk.tokenize.word_tokenize(tstrout)
the_list_dist = FreqDist(words)

print(the_list_dist.most_common(20))
# Beispiel #4 (scraped example separator — not Python code)
# 0
    # NOTE(review): near-duplicate of the previous example (only spacing
    # differs); same caveats apply. `c_line`, `f`, `string`, `tstrout`,
    # `nltk`, and `FreqDist` are bound outside this fragment.
    # list_id = c_line[:c_line.find(',')]
    # NOTE(review): c_line[0] is a single character — the isdigit() guard
    # inspects one char only, and ' '.join(c_line[1:]) joins characters.
    # Presumably c_line should be a split list; verify against the caller.
    list_id = c_line[0]
    if not list_id.isdigit():
        # reader_csv.next()
        continue
    c_line=str(c_line.strip())
    # c_string is everything after the list_id
    c_string = ' '.join(c_line[1:])
    # print('working on list :', list_id, file=sys.stderr)
    print('working on list :',list_id)

    # Normalize: strip whitespace and remove non-printable characters.
    c_string=str(c_string.strip())
    c_string = ''.join(filter(lambda x: x in string.printable, c_string))
    if len(c_string):
        print('printable chars:',c_string)
        s = f.parse(c_string,True)
        strout = ''
        if s is not None:
            # Flatten every value list of the parse result into words.
            for item in s.items():
                # print(1,str(item[1]))
                word = []
                for word in item[1]:
                    strout = strout+' '+word
        # Accumulate this line's words into the running blob.
        if len(strout):
            tstrout = tstrout+' '+strout
# After the (unseen) loop: frequency distribution over every word gathered.
print(tstrout)
words = nltk.tokenize.word_tokenize(tstrout)
the_list_dist = FreqDist(words)

print(the_list_dist.most_common(20))
        # NOTE(review): duplicate variant of an earlier fragment; the
        # enclosing loop, `current_string`, `f`, `out_file`, and `Counter`
        # are all defined outside this excerpt.
        list_id = list_id.lstrip('(')
        # print(list_id)
        # Guard: skip records whose list_id is empty after stripping '('.
        if not len(list_id):
            print('list_id,',list_id,' zero length... go to next record')
            continue
        # Guard: skip records whose list_id is not purely numeric.
        if not list_id.isdigit():
            print('list_id not numeric... go to next record')
            continue
        # Everything after the first field is treated as the text payload.
        curr_string = ' '.join(current_string[1:])
        # print('starting with text :', curr_string)
        # print('working on list :',list_id)
        # list_dict = get_list_dists(list_id)

        # Term-frequency accumulator for this record.
        c = Counter()

        parsed_text = f.parse(curr_string, is_list=True)
        # print(parsed_text)
        if parsed_text is not None:
            # Unigrams tallied by count; bigrams/entities counted once each.
            c.update(parsed_text['text'])
            c.update(dict.fromkeys(parsed_text['bigrams'], 1))
            c.update(dict.fromkeys(parsed_text['entities'], 1))
        # else:
        #     print('parsed text is empty')
        # cycle through dictionary
        # Write a file


            # NOTE(review): despite the dedented comments above, this `if`
            # still belongs to the `parsed_text is not None` suite (comment
            # indentation is ignored by Python). Unlike the earlier variant,
            # the trailing "parsed text is empty" message write is missing
            # here — the record truncates after list_id; confirm intent.
            if not c.items():
                out_file.write('\n')
                out_file.write('for list_id = ')
                out_file.write(list_id)
    # NOTE(review): duplicate variant of an earlier fragment (spacing
    # differs). `cursor`, `get_listinfo_for_member`, `f`, `string`, `nltk`,
    # and `FreqDist` are bound by the surrounding, unseen file.
    rows = cursor.fetchall()
    for row in rows:
        member_id = row[0]
        mlistcount = row[1]
        print('working on member_id',' ',member_id)
        # Fetch every list row for this member; accumulate parsed words
        # into one blob (tstrout).
        cursor.execute(get_listinfo_for_member,[member_id])
        tstrout = ''
        # NOTE(review): rebinding `rows` shadows the sequence the outer
        # loop iterates — works, but fragile; confirm intent.
        rows = cursor.fetchall()
        for row in rows:
        # for current_line in reader:
            c_line=str(row)
            # c_line=str(current_line.strip())
            # Drop non-printable characters before parsing.
            c_line = ''.join(filter(lambda x: x in string.printable, c_line))
            print(c_line)
            if len(c_line):
                s = f.parse(c_line,True)
                strout = ''
                if s is not None:
                    # Flatten every value list of the parse result into words.
                    for item in s.items():
                        # print(1,str(item[1]))
                        word = []
                        for word in item[1]:
                            strout = strout+' '+str(word)

                if len(strout):
                    tstrout = tstrout+' '+strout
        # print(tstrout)
        # Frequency distribution over all words gathered for this member.
        words = nltk.tokenize.word_tokenize(tstrout)
        the_list_dist = FreqDist(words)

        # print('for',member_id,'on',mlistcount[0],'lists:',the_list_dist.most_common(10))