コード例 #1
0
def road_stop_token(list_line, command, stop_en):
    """Extract tokens from raw data lines, print them, and return them.

    How each line is parsed depends on ``command``:

    * ``'abbr'``: print the first tab-separated field of every line; no
      tokens are collected.
    * ``'road'``: split each line on ``';'`` and then on whitespace, pass
      every word through ``filter_token`` and collect the unique results.
    * ``'busstop'``: skip the first line (presumably a header row), split
      the second tab-separated field on whitespace, pass every word
      through ``filter_token`` and collect the unique results. Lines
      without a second field are skipped.
    * ``'bussvc'``: skip the first line (presumably a header row) and
      collect the stripped first tab-separated field of every remaining
      line; duplicates are kept.

    For ``'road'`` and ``'busstop'`` a filtered token is dropped when it is
    empty, already collected, contained in ``stop_en``, or when ``is_int``
    does not return ``False`` for it. Any other ``command`` collects
    nothing.

    After processing, every collected token is printed in lower case,
    followed by a summary line with the number of tokens.

    Args:
        list_line: Iterable of text lines to process.
        command: One of ``'abbr'``, ``'road'``, ``'busstop'``, ``'bussvc'``.
        stop_en: Container of stop words to exclude (only consulted for
            ``'road'`` and ``'busstop'``).

    Returns:
        The list of collected tokens in first-seen order, in their
        original casing (only the printed output is lower-cased).
    """
    list_token = []
    # Mirrors list_token for O(1) duplicate checks; the list keeps the order.
    seen = set()

    def keep(token):
        # All checks run on the exact value that would be stored, so a word
        # cannot slip past the stop-word/integer filters merely because its
        # unfiltered form looked different.
        if not token or token in seen or token in stop_en:
            return
        if is_int(token) is False:
            seen.add(token)
            list_token.append(token)

    for index, line in enumerate(list_line):
        if command == 'abbr':  # no need to use
            print(line.split('\t')[0])

        elif command == 'road':
            for element in line.split(';'):
                for word in element.split():
                    keep(filter_token(word))

        elif command == 'busstop':
            if index == 0:
                continue  # first line is skipped (presumably the header)
            fields = line.split('\t')
            if len(fields) < 2:
                continue  # e.g. a trailing blank line: no name column
            for word in fields[1].split():
                keep(filter_token(word).strip())

        elif command == 'bussvc':
            if index == 0:
                continue  # first line is skipped (presumably the header)
            list_token.append(line.split('\t')[0].strip())

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for value in list_token:
        print(value.lower())
    print('Total length of list: %i' % len(list_token))
    return list_token
コード例 #2
0
def filtering_tweetText(path, name_write):
    """Filter every stored tweet text and write the results to a file.

    Reads ``tweetID`` and ``tweetText`` from the ``twitter_posts_distinct``
    table of the local ``twitter_bus`` MySQL database, passes each text
    through ``filter_token``, prints the tab-separated ``tweetID`` and
    filtered text for every row, and hands the collected lines to
    ``write_file``.

    Args:
        path: Forwarded to ``write_file`` as its first argument.
        name_write: Forwarded to ``write_file`` as its second argument.
    """
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment instead.
    db = MySQLdb.connect(host="localhost", # your host, usually localhost
                     user="******", # your username
                      passwd="ducthong", # your password
                      db="twitter_bus") # name of the data base
    list_write = []
    try:
        cur = db.cursor()
        cur.execute('select tweetID, tweetText from twitter_posts_distinct')
        for row in cur.fetchall():
            # Assumes tweetID comes back as a string; an integer column
            # would make this concatenation raise TypeError -- TODO confirm.
            line = row[0] + '\t' + filter_token(row[1])
            print(line)
            list_write.append(line)
    finally:
        # Release the connection even if the query or the filtering fails.
        db.close()
    write_file(path, name_write, list_write)
コード例 #3
0
def filtering_facebookBusNews(path, name_write):
    """Filter every stored Facebook bus-news post and write the results.

    Reads ``facebookID`` and ``post`` from the ``facebook_busnews_ver2``
    table of the local ``2015_allschemas`` MySQL database, passes each post
    through ``filter_token``, prints the tab-separated ``facebookID`` and
    filtered post for every row, and hands the collected lines to
    ``write_file``.

    Args:
        path: Forwarded to ``write_file`` as its first argument.
        name_write: Forwarded to ``write_file`` as its second argument.
    """
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment instead.
    db = MySQLdb.connect(host="localhost", # your host, usually localhost
                     user="******", # your username
                      passwd="ducthong", # your password
                      db="2015_allschemas") # name of the data base
    list_write = []
    try:
        cur = db.cursor()
        cur.execute('select facebookID, post from facebook_busnews_ver2')
        for row in cur.fetchall():
            # Assumes facebookID comes back as a string; an integer column
            # would make this concatenation raise TypeError -- TODO confirm.
            post_id = row[0]
            post_text = filter_token(row[1])
            line = post_id + '\t' + post_text
            print(line)
            list_write.append(line)
    finally:
        # Release the connection even if the query or the filtering fails.
        db.close()
    write_file(path, name_write, list_write)