list_id = list_id.lstrip('(') # print(list_id) if not len(list_id): print('list_id,', list_id, ' zero length... go to next record') continue if not list_id.isdigit(): print('list_id not numeric... go to next record') continue curr_string = ' '.join(current_string[1:]) # print('starting with text :', curr_string) # print('working on list :',list_id) # list_dict = get_list_dists(list_id) c = Counter() parsed_text = f.parse(curr_string, is_list=True) # print(parsed_text) if parsed_text is not None: c.update(parsed_text['text']) c.update(dict.fromkeys(parsed_text['bigrams'], 1)) c.update(dict.fromkeys(parsed_text['entities'], 1)) # else: # print('parsed text is empty') # cycle through dictionary # Write a file if not c.items(): out_file.write('\n') out_file.write('for list_id = ') out_file.write(list_id) out_file.write('parsed text is empty\n')
# --- Fragment: build a word-frequency distribution per member from that
# --- member's list rows (cursor, get_listinfo_for_member, f, string, nltk
# --- and FreqDist are defined outside this view).
# NOTE(review): indentation is reconstructed from a whitespace-mangled paste;
# the placement of the word_tokenize/FreqDist tail is a best guess — confirm.

rows = cursor.fetchall()
for row in rows:
    member_id = row[0]
    mlistcount = row[1]
    print('working on member_id', ' ', member_id)
    # Fetch this member's list rows with a second query on the same cursor.
    cursor.execute(get_listinfo_for_member, [member_id])
    tstrout = ''
    # NOTE(review): `rows` and `row` are re-bound here, shadowing the outer
    # loop's names. CPython keeps iterating the original fetchall() list, so
    # this happens to work, but it is fragile — consider distinct names.
    rows = cursor.fetchall()
    for row in rows:
        # for current_line in reader:
        c_line = str(row)
        # c_line=str(current_line.strip())
        # Keep printable characters only (drops non-ASCII / control chars).
        c_line = ''.join(filter(lambda x: x in string.printable, c_line))
        print(c_line)
        if len(c_line):
            s = f.parse(c_line, True)
            strout = ''
            if s is not None:
                # Flatten every value list in the parse result into one string.
                for item in s.items():
                    # print(1,str(item[1]))
                    word = []
                    for word in item[1]:
                        strout = strout + ' ' + str(word)
            if len(strout):
                tstrout = tstrout + ' ' + strout
    # print(tstrout)
    # Frequency distribution over all text accumulated for this member.
    words = nltk.tokenize.word_tokenize(tstrout)
    the_list_dist = FreqDist(words)
    # print('for',member_id,'on',mlistcount[0],'lists:',the_list_dist.most_common(10))
# list_id = c_line[:c_line.find(',')] list_id = c_line[0] if not list_id.isdigit(): # reader_csv.next() continue c_line = str(c_line.strip()) # c_string is everything after the list_id c_string = ' '.join(c_line[1:]) # print('working on list :', list_id, file=sys.stderr) print('working on list :', list_id) c_string = str(c_string.strip()) c_string = ''.join(filter(lambda x: x in string.printable, c_string)) if len(c_string): print('printable chars:', c_string) s = f.parse(c_string, True) strout = '' if s is not None: for item in s.items(): # print(1,str(item[1])) word = [] for word in item[1]: strout = strout + ' ' + word if len(strout): tstrout = tstrout + ' ' + strout print(tstrout) words = nltk.tokenize.word_tokenize(tstrout) the_list_dist = FreqDist(words) print(the_list_dist.most_common(20))
# list_id = c_line[:c_line.find(',')] list_id = c_line[0] if not list_id.isdigit(): # reader_csv.next() continue c_line=str(c_line.strip()) # c_string is everything after the list_id c_string = ' '.join(c_line[1:]) # print('working on list :', list_id, file=sys.stderr) print('working on list :',list_id) c_string=str(c_string.strip()) c_string = ''.join(filter(lambda x: x in string.printable, c_string)) if len(c_string): print('printable chars:',c_string) s = f.parse(c_string,True) strout = '' if s is not None: for item in s.items(): # print(1,str(item[1])) word = [] for word in item[1]: strout = strout+' '+word if len(strout): tstrout = tstrout+' '+strout print(tstrout) words = nltk.tokenize.word_tokenize(tstrout) the_list_dist = FreqDist(words) print(the_list_dist.most_common(20))
list_id = list_id.lstrip('(') # print(list_id) if not len(list_id): print('list_id,',list_id,' zero length... go to next record') continue if not list_id.isdigit(): print('list_id not numeric... go to next record') continue curr_string = ' '.join(current_string[1:]) # print('starting with text :', curr_string) # print('working on list :',list_id) # list_dict = get_list_dists(list_id) c = Counter() parsed_text = f.parse(curr_string, is_list=True) # print(parsed_text) if parsed_text is not None: c.update(parsed_text['text']) c.update(dict.fromkeys(parsed_text['bigrams'], 1)) c.update(dict.fromkeys(parsed_text['entities'], 1)) # else: # print('parsed text is empty') # cycle through dictionary # Write a file if not c.items(): out_file.write('\n') out_file.write('for list_id = ') out_file.write(list_id)
# --- Fragment: build a word-frequency distribution per member from that
# --- member's list rows (cursor, get_listinfo_for_member, f, string, nltk
# --- and FreqDist are defined outside this view).
# NOTE(review): duplicate of another fragment in this file (only whitespace
# differed) — a candidate for deduplication. Indentation is reconstructed
# from a whitespace-mangled paste; the placement of the word_tokenize/
# FreqDist tail is a best guess — confirm.

rows = cursor.fetchall()
for row in rows:
    member_id = row[0]
    mlistcount = row[1]
    print('working on member_id', ' ', member_id)
    # Fetch this member's list rows with a second query on the same cursor.
    cursor.execute(get_listinfo_for_member, [member_id])
    tstrout = ''
    # NOTE(review): `rows` and `row` are re-bound here, shadowing the outer
    # loop's names. CPython keeps iterating the original fetchall() list, so
    # this happens to work, but it is fragile — consider distinct names.
    rows = cursor.fetchall()
    for row in rows:
        # for current_line in reader:
        c_line = str(row)
        # c_line=str(current_line.strip())
        # Keep printable characters only (drops non-ASCII / control chars).
        c_line = ''.join(filter(lambda x: x in string.printable, c_line))
        print(c_line)
        if len(c_line):
            s = f.parse(c_line, True)
            strout = ''
            if s is not None:
                # Flatten every value list in the parse result into one string.
                for item in s.items():
                    # print(1,str(item[1]))
                    word = []
                    for word in item[1]:
                        strout = strout + ' ' + str(word)
            if len(strout):
                tstrout = tstrout + ' ' + strout
    # print(tstrout)
    # Frequency distribution over all text accumulated for this member.
    words = nltk.tokenize.word_tokenize(tstrout)
    the_list_dist = FreqDist(words)
    # print('for',member_id,'on',mlistcount[0],'lists:',the_list_dist.most_common(10))