Example #1
0
def parseStreamFromTime(year, month, day, streamDir, parsedDir, poll_interval=30):  # start from one specific day
    """Parse hourly gzipped stream logs forever, starting at hour 0 of a given day.

    For every hour, reads ``<streamDir>/statuses.log.<YYYY-MM-DD-HH>.gz`` and
    writes ``<parsedDir>/statuses.parsed.<YYYY-MM-DD-HH>``. Each output line is
    ``extractStatus(line)``, a tab, ``extractOriginStatus(line)`` and a newline.
    Input lines for which ``extractStatus`` returns ``""`` are skipped.

    When the gzip file for the current hour does not exist yet, the function
    sleeps and retries the same hour. It never returns on its own.

    Args:
        year, month, day: Calendar date of the first hour to parse.
        streamDir: Directory containing the ``statuses.log.*.gz`` files.
        parsedDir: Directory that receives the ``statuses.parsed.*`` files.
        poll_interval: Seconds to sleep between checks for a stream file that
            is not available yet (defaults to the previous fixed 30 seconds).
    """
    cur_time = datetime.datetime(year, month, day, 0)
    one_hour = datetime.timedelta(hours=1)

    while True:
        cur_suffix = cur_time.strftime("%Y-%m-%d-%H")
        cur_stream_path = "%s/statuses.log.%s.gz" % (streamDir, cur_suffix)
        cur_parsed_path = "%s/statuses.parsed.%s" % (parsedDir, cur_suffix)

        if not os.path.exists(cur_stream_path):
            # The gzip file for this hour has not appeared yet: wait and retry.
            print("current parsed stream file is %s (not ready), sleep to wait..." % cur_stream_path)
            time.sleep(poll_interval)
            continue

        print("current parsed stream file is %s (ready)" % cur_stream_path)
        # Context managers close both files even if parsing a line raises.
        with gzip.open(cur_stream_path, "rt") as cur_stream_file:
            with open(cur_parsed_path, "w") as cur_parsed_file:
                for line in cur_stream_file:
                    out_str = extractStatus(line)
                    if out_str == "":
                        continue  # skip tweets that are empty after stemming / stopword removal
                    origin_str = extractOriginStatus(line)
                    cur_parsed_file.write(out_str + "\t" + origin_str + "\n")

        cur_time = cur_time + one_hour  # move on to the next hour
Example #2
0
def parseCurStream(poll_interval=1):
    """Follow the live hourly stream log and write parsed statuses, forever.

    Starting from the current hour, reads ``../statuses.log.<YYYY-MM-DD-HH>``
    line by line as it grows. Each line is passed to ``extractStatus``; lines
    yielding ``""`` are skipped, the others are written to the parsed file as
    ``extractStatus(line)``, a tab, ``extractOriginStatus(line)`` and a newline.

    When the end of the current file is reached and the next hour's file
    exists, reading switches to that file. Otherwise the function sleeps and
    polls again. It never returns on its own.

    The parsed file is opened once as
    ``./parsed/statuses.parsed.<YYYY-MM-DD-HH>`` (start hour) and stays open
    across hours. When the day of month changes it is closed and replaced by
    ``./parsed/statuses.parsed.<YYYY-MM-DD>``.

    Args:
        poll_interval: Seconds to sleep while waiting for the current stream
            file to appear or to receive more data.
    """
    import time  # local import so the block does not depend on a file-level import

    cur_time = datetime.datetime.now()
    suffix1 = cur_time.strftime("%Y-%m-%d-%H")
    suffix2 = cur_time.strftime("%Y-%m-%d-%H")
    cur_day = cur_time.strftime("%d")
    print("current parsed stream is %s" % suffix1)
    cur_stream_path = "../statuses.log.%s" % suffix1
    cur_parsed_path = "./parsed/statuses.parsed.%s" % suffix2

    # next hour stream
    next_time = cur_time + datetime.timedelta(hours=1)
    next_suffix = next_time.strftime("%Y-%m-%d-%H")
    next_stream_path = "../statuses.log.%s" % next_suffix

    # Keep one parsed file open across hours. Reopening it with "w" for every
    # hourly stream file would wipe what earlier hours already wrote.
    cur_parsed_file = open(cur_parsed_path, "w")
    try:
        while True:
            if not os.path.exists(cur_stream_path):
                # Stream file not created yet: wait instead of spinning.
                time.sleep(poll_interval)
                continue

            with open(cur_stream_path, "r") as cur_stream_file:
                while True:
                    line = cur_stream_file.readline()
                    if not line:
                        if os.path.exists(next_stream_path):  # change to next hour
                            break
                        # End of file but the hour is not over: make the output
                        # visible to readers, then wait for more data.
                        cur_parsed_file.flush()
                        time.sleep(poll_interval)
                        continue
                    out_str = extractStatus(line)
                    if out_str == "":
                        continue  # skip tweets that are empty after stemming / stopword removal
                    origin_str = extractOriginStatus(line)
                    cur_parsed_file.write(out_str + "\t" + origin_str + "\n")
            cur_parsed_file.flush()

            # Advance exactly one hour, to the file that was just seen to exist.
            # Using the wall clock here could re-read the same hour or skip one.
            cur_time = next_time
            suffix1 = cur_time.strftime("%Y-%m-%d-%H")
            cur_stream_path = "../statuses.log.%s" % suffix1
            print("current parsed stream is %s" % suffix1)
            next_time = cur_time + datetime.timedelta(hours=1)
            next_suffix = next_time.strftime("%Y-%m-%d-%H")
            next_stream_path = "../statuses.log.%s" % next_suffix

            # check whether day changes
            new_day = cur_time.strftime("%d")
            if new_day != cur_day:
                cur_day = new_day  # remember the new day so rotation happens once per day
                cur_parsed_file.close()
                suffix2 = cur_time.strftime("%Y-%m-%d")
                cur_parsed_path = "./parsed/statuses.parsed.%s" % suffix2
                cur_parsed_file = open(cur_parsed_path, "w")
    finally:
        cur_parsed_file.close()