def road_stop_token(list_line, command, stop_en):
    """Extract tokens from ``list_line`` according to ``command`` and print them.

    The collected tokens are printed one per line in lower case, followed by
    a ``Total length of list: N`` summary line.

    Supported values of ``command``:

    * ``'abbr'``    -- print the first tab-separated field of every line;
      nothing is collected.
    * ``'road'``    -- split every line on ``';'`` and then on whitespace,
      pass each token through ``filter_token`` and keep it when it has not
      been collected yet, is not in ``stop_en`` and ``is_int`` reports False.
    * ``'busstop'`` -- skip the first line, then tokenise the second
      tab-separated field on whitespace and keep the filtered, non-empty,
      not-yet-collected tokens.
    * ``'bussvc'``  -- skip the first line, then collect the stripped first
      tab-separated field of every line (duplicates are kept).

    Any other ``command`` collects nothing.

    Args:
        list_line: iterable of text lines to process.
        command: one of the mode strings listed above.
        stop_en: container of words to exclude (only tested with ``in``).

    Returns:
        None. The result is only written to standard output.

    Raises:
        IndexError: in ``'busstop'`` mode when a non-header line has no
            second tab-separated field.
    """
    list_token = []
    # Tokens appended by the de-duplicating modes ('road' and 'busstop').
    # Gives O(1) duplicate checks instead of rescanning list_token each time.
    seen = set()
    cnt = 0  # number of lines consumed so far; used to skip the first line

    for line in list_line:
        if command == 'abbr':
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(line.split('\t')[0])

        elif command == 'road':
            for element in line.split(';'):
                for each in element.split():
                    each = filter_token(each)
                    if each in seen or each in stop_en:
                        continue
                    if is_int(each) is False:
                        list_token.append(each)
                        seen.add(each)

        elif command == 'busstop':
            cnt += 1
            if cnt <= 1:
                continue  # first line is skipped (presumably a header row)
            for each in line.split('\t')[1].split():
                filter_each = filter_token(each.strip())
                # NOTE(review): the stop-word and is_int checks below use the
                # raw token, whereas 'road' mode checks the filtered token.
                # Kept as-is; confirm which behaviour is intended.
                if filter_each in seen or len(filter_each) == 0 or each in stop_en:
                    continue
                if is_int(each) is False:
                    token = filter_each.strip()
                    list_token.append(token)
                    seen.add(token)

        elif command == 'bussvc':
            cnt += 1
            if cnt > 1:  # first line is skipped (presumably a header row)
                list_token.append(line.split('\t')[0].strip())

    for value in list_token:
        print(value.lower())
    print('Total length of list: %i' % len(list_token))
def filtering_tweetText(path, name_write):
    """Filter every tweet text in ``twitter_posts_distinct`` and write the result.

    Each row's ``tweetText`` is passed through ``filter_token``; the line
    ``tweetID<TAB>filtered text`` is printed and collected, and the collected
    lines are then handed to ``write_file(path, name_write, lines)``.

    Args:
        path: forwarded unchanged to ``write_file`` as its first argument.
        name_write: forwarded unchanged to ``write_file`` as its second
            argument.

    Returns:
        None.
    """
    # NOTE(review): connection credentials are hard-coded here; consider
    # moving them to configuration or the environment.
    db = MySQLdb.connect(host="localhost",   # your host, usually localhost
                         user="******",      # your username
                         passwd="ducthong",  # your password
                         db="twitter_bus")   # name of the data base
    list_write = []
    try:
        cur = db.cursor()
        cur.execute('select tweetID, tweetText from twitter_posts_distinct')
        for row in cur.fetchall():
            tweetID = row[0]
            tweetText = filter_token(row[1])
            # Concatenation assumes tweetID is already a string -- TODO confirm
            # against the column type.
            record = tweetID + '\t' + tweetText
            print(record)
            list_write.append(record)
    finally:
        # Release the connection even when the query or filtering fails.
        db.close()

    write_file(path, name_write, list_write)
def filtering_facebookBusNews(path, name_write):
    """Filter every post in ``facebook_busnews_ver2`` and write the result.

    Each row's ``post`` is passed through ``filter_token``; the line
    ``facebookID<TAB>filtered text`` is printed and collected, and the
    collected lines are then handed to ``write_file(path, name_write, lines)``.

    Args:
        path: forwarded unchanged to ``write_file`` as its first argument.
        name_write: forwarded unchanged to ``write_file`` as its second
            argument.

    Returns:
        None.
    """
    # NOTE(review): connection credentials are hard-coded here; consider
    # moving them to configuration or the environment.
    db = MySQLdb.connect(host="localhost",     # your host, usually localhost
                         user="******",        # your username
                         passwd="ducthong",    # your password
                         db="2015_allschemas")  # name of the data base
    list_write = []
    try:
        cur = db.cursor()
        cur.execute('select facebookID, post from facebook_busnews_ver2')
        for row in cur.fetchall():
            post_id = row[0]
            post_text = filter_token(row[1])
            # Concatenation assumes facebookID is already a string -- TODO
            # confirm against the column type.
            record = post_id + '\t' + post_text
            print(record)
            list_write.append(record)
    finally:
        # Release the connection even when the query or filtering fails.
        db.close()

    write_file(path, name_write, list_write)