def main(argv):
    infile = argv[1]
    outfile1 = argv[2]
    outfile2 = argv[3]
    
    OUT1 = open(outfile1, 'w')
    OUT2 = open(outfile2, 'w')
    unicode_count = 0
    period = 60
    tag_dict = {}
    tag_dq = collections.deque([], maxlen=period+1)      ### deque holds at most 61 entries: period+1, since tweets may arrive out of order by up to 2 s
    last_time = ''
    
    k = 0
    N_v = 0
    d_total = 0
    escapes = ''.join([chr(char) for char in range(1, 32)])      ### ASCII control characters 1-31, passed to the cleaning helper
    temp_list = 31 * ' '                                          ### 31-character string of spaces, paired with `escapes` for character translation
    
    with open(infile) as IN:
        for line in IN:
            (content, time1, unicode_flag) = ut.tweets_cleaned_json(line, escapes, temp_list)
            if not time1:
                continue
            
            if unicode_flag == 1:
                unicode_count += 1
                
            OUT1.write(content + " (timestamp: " + time1 + ")\n")
            
            tag_list = re.findall('#[^ ]*', content)                              ### find all potential hashtags in the text
            tag_list = [ut.clean_tag(x.strip('#')) for x in tag_list]             ### strip the leading '#' and clean each candidate tag
            tag_list = filter(None, [y.lower() for x in tag_list for y in x])     ### flatten, lower-case, and drop empty tags
            tag_list = ['#'+x for x in tag_list]                                  ### add '#' back in front of each tag

            temp_str = time1.split(" ")                                           ### created_at format, e.g. "Thu Oct 29 17:51:01 +0000 2015"
            temp_str1 = temp_str[3].split(":")
            time_str = temp_str[1].strip() + ' ' + temp_str[2].strip() + ' ' + temp_str[5].strip() + ' ' + temp_str1[0].strip() + ' ' + temp_str1[1].strip() + ' ' + temp_str1[2].strip()
            time2 = datetime.strptime(time_str, '%b %d %Y %H %M %S')
            N_v, d_total, last_time = ut.average_degree(tag_list, time2, N_v, d_total, tag_dict, tag_dq, last_time, period)
            if N_v == 0:
                av_degree = 0
            else:
                av_degree = float(d_total) / N_v 
                
            if k == 1:                                                           ### every output line after the first is preceded by a newline
                OUT2.write("\n%.2f" % av_degree)   
            else:
                k = 1                   
                OUT2.write("%.2f" % av_degree)
    
    OUT1.write(str(unicode_count) + " tweets contained unicode.")
    OUT1.close()
    OUT2.close()
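
### The rolling-window computation above relies on ut.average_degree, defined in
### the separate ut module.  The helper below is only a minimal, self-contained
### sketch of the same idea (name, signature, and data layout are assumptions,
### not the actual ut implementation): keep the tweets of the last `period`
### seconds in a deque, rebuild the hashtag co-occurrence edge set, and return
### total degree divided by the number of vertices.
### e.g.  dq = collections.deque(); _rolling_average_degree_sketch(dq, ['#a', '#b'], t)
def _rolling_average_degree_sketch(window, tags, timestamp, period=60):
    """window: deque of (datetime, set-of-tags) pairs, mutated in place."""
    from itertools import combinations

    window.append((timestamp, set(tags)))
    newest = max(t for t, _ in window)
    ### evict tweets that fall outside the `period`-second window
    while window and (newest - window[0][0]).total_seconds() > period:
        window.popleft()

    ### rebuild the undirected edge set of co-occurring hashtags
    edges = set()
    for _, tag_set in window:
        for a, b in combinations(sorted(tag_set), 2):
            edges.add((a, b))

    nodes = set()
    for a, b in edges:
        nodes.update((a, b))
    if not nodes:
        return 0.0
    return 2.0 * len(edges) / len(nodes)      ### average degree = 2E / V
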
def main(argv):
    if len(argv) != 4:
        print "3 arguments required!"
        print "Usage: code_challenge_hadoop.py input.txt output1.txt output2.txt"
        exit(1)

    current_dir = os.getcwd()
    infile = argv[1].strip()
    outfile1 = argv[2].strip()
    outfile2 = argv[3].strip()
    result_dir = current_dir + '/count_result'
    hadoop_home = os.getenv('HADOOP_HOME').strip()
    stream_jar = glob.glob(hadoop_home + '/contrib/streaming/hadoop-streaming*.jar')[0]
    mapper_file = current_dir + '/count_mapper_hive.py'
    reducer_file = current_dir + '/count_reducer_hive.py'
    
    ### clean tweets, output to ft1.txt and count for unicode
    OUT1 = open(outfile1, 'w')
    unicode_count = 0
    k = 0
    escapes = ''.join([chr(char) for char in range(1, 32)])
    temp_list = 31 * ' '
    with open(infile) as IN:
        for line in IN:
            (content, time1, unicode_flag) = ut.tweets_cleaned_json(line, escapes, temp_list)
            if not time1:
                continue
            
            if unicode_flag == 1:
                unicode_count += 1
                
            if k == 1:
                OUT1.write("\n" + content + " (timestamp: " + time1 + ")" )
            else:
                k = 1
                OUT1.write(content + " (timestamp: " + time1 + ")" )
     
    OUT1.close()           
    OUT2 = open(outfile2, 'w')                                        ### for Hadoop streaming, the cleaned tweets (outfile1, e.g. ft1_hadoop.txt)
    OUT2.write(str(unicode_count) + " tweets contained unicode.")     ### and the unicode count (outfile2, e.g. unicode_count.txt) go to separate files
    OUT2.close()     
    
    
    ### Extract tag pairs and aggregate identical pairs that fall within the same minute
    input_file = outfile1
    call_str = 'hadoop jar ' + stream_jar + ' -mapper ' + mapper_file + ' -reducer ' + reducer_file + \
               ' -file ' + mapper_file + ' -file ' + reducer_file + ' -input ' + input_file + ' -output ' + \
               result_dir
               
    call_list = ['hadoop', 'jar', stream_jar, '-mapper', mapper_file, '-reducer', reducer_file, 
                 '-file', mapper_file, '-file', reducer_file, '-input', input_file, '-output', 
                 result_dir]
    print call_str
    subprocess.Popen(call_list).wait()
    

    ### Hive data warehouse building
    from hive_service import ThriftHive
    from thrift import Thrift
    from thrift.transport import TSocket
    from thrift.transport import TTransport
    from thrift.protocol import TBinaryProtocol
    
    input_hive = result_dir + "/part-00000"
    if os.path.isfile(result_dir + "/_SUCCESS"):
        try:
            transport = TSocket.TSocket('localhost', 10000)
            transport = TTransport.TBufferedTransport(transport)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
 
            client = ThriftHive.Client(protocol)
            transport.open()
            print "Creating tables via Thrift requires the following change in the MySQL metastore:"
            print "alter table SDS alter column IS_STOREDASSUBDIRECTORIES set default  0;"
            
            client.execute("drop table if exists edge_source")                             
            client.execute("""CREATE EXTERNAL TABLE if NOT EXISTS edge_source(year INT, month INT, day INT,
                           hour INT, minute INT, tag1 STRING, tag2 STRING, count INT) 
                           ROW FORMAT DELIMITED FIELDS TERMINATED by ','""")
            call_str = "LOAD DATA LOCAL INPATH '" + input_hive + "' INTO TABLE edge_source"
            client.execute(call_str)
                           
            client.execute("drop table if exists edge")          
            client.execute("""CREATE TABLE if NOT EXISTS edge(tag1 STRING, tag2 STRING, count INT) 
                           PARTITIONED by (year INT, month INT, day INT, hour INT, minute INT)
                           ROW FORMAT DELIMITED FIELDS TERMINATED by ','""")
                           
            client.execute("set hive.exec.dynamic.partition=true")
            client.execute("set hive.exec.dynamic.partition.mode=nonstrict")
            client.execute("""INSERT OVERWRITE TABLE edge PARTITION (year, month, day, hour, minute) 
                           SELECT tag1, tag2, count, year, month, day, hour, minute FROM edge_source""")
            
            
            #client.execute("select * from edge")
            #print client.fetchAll()
            
            transport.close()                                 ### close the Thrift transport when finished
        
        except Thrift.TException, tx:
            print "Hive/Thrift error occurred!"
            print '%s' % (tx.message)
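
### Sketch only: once the `edge` table has been populated, the same Thrift
### client pattern used above can run ad-hoc queries against it.  The host,
### port, and query below are illustrative assumptions, not part of the
### pipeline above.
def _query_edge_counts_sketch(host='localhost', port=10000):
    from hive_service import ThriftHive
    from thrift.transport import TSocket, TTransport
    from thrift.protocol import TBinaryProtocol

    socket = TSocket.TSocket(host, port)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = ThriftHive.Client(protocol)
    transport.open()
    try:
        client.execute("SELECT year, month, day, hour, minute, COUNT(*) "
                       "FROM edge GROUP BY year, month, day, hour, minute")
        return client.fetchAll()                  ### list of result rows as strings
    finally:
        transport.close()
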
def main(argv):
    if len(argv) != 5:
        print "4 arguments required!"
        print "Usage: code_challenge_hadoop.py input.txt output1.txt output2.txt output3.txt"
        exit(1)

    current_dir = os.getcwd()
    infile = argv[1].strip()
    outfile1 = argv[2].strip()
    outfile2 = argv[3].strip()
    outfile3 = argv[4].strip()
    result_dir = current_dir + "/count_result"
    hadoop_home = os.getenv("HADOOP_HOME").strip()
    stream_jar = glob.glob(hadoop_home + "/contrib/streaming/hadoop-streaming*.jar")[0]
    mapper_file = current_dir + "/count_mapper.py"
    reducer_file = current_dir + "/count_reducer.py"

    ### clean tweets, output to ft1.txt and count for unicode
    OUT1 = open(outfile1, "w")
    unicode_count = 0
    k = 0
    escapes = "".join([chr(char) for char in range(1, 32)])
    temp_list = 31 * " "
    with open(infile) as IN:
        for line in IN:
            (content, time1, unicode_flag) = ut.tweets_cleaned_json(line, escapes, temp_list)
            if not time1:
                continue

            if unicode_flag == 1:
                unicode_count += 1

            if k == 1:
                OUT1.write("\n" + content + " (timestamp: " + time1 + ")")
            else:
                k = 1
                OUT1.write(content + " (timestamp: " + time1 + ")")

    OUT1.close()
    OUT2 = open(outfile2, "w")  ### for Hadoop streaming, the cleaned tweets (outfile1) and
    OUT2.write(
        str(unicode_count) + " tweets contained unicode."
    )  ### the unicode count (outfile2) go to separate files
    OUT2.close()

    ### Extract tag pairs and aggregate identical pairs that fall within the same minute
    input_file = outfile1
    call_str = (
        "hadoop jar "
        + stream_jar
        + " -mapper "
        + mapper_file
        + " -reducer "
        + reducer_file
        + " -file "
        + mapper_file
        + " -file "
        + reducer_file
        + " -input "
        + input_file
        + " -output "
        + result_dir
    )

    call_list = [
        "hadoop",
        "jar",
        stream_jar,
        "-mapper",
        mapper_file,
        "-reducer",
        reducer_file,
        "-file",
        mapper_file,
        "-file",
        reducer_file,
        "-input",
        input_file,
        "-output",
        result_dir,
    ]
    print call_str
    subprocess.Popen(call_list).wait()

    if os.path.isfile(result_dir + "/_SUCCESS"):
        period = 10  ### update average degree of last 10 min every 1 min
        tag_dict = {}
        tag_list = []
        tag_dq = collections.deque([], maxlen=period)
        last_time = ""
        N_v, d_total, k = 0, 0, 0
        OUT3 = open(outfile3, "w")
        with open(result_dir + "/part-00000") as IN:
            for line in IN:
                fields = line.split("\t")                     ### streaming reducer output is tab-separated
                time_str, tag1, tag2, count = fields[0], fields[1], fields[2], fields[3]
                timestamp = datetime.strptime(time_str, "%Y-%m-%d %H:%M")

                if k == 0:
                    k = 1
                    last_time = timestamp

                if timestamp == last_time:
                    if tag1 and tag2:
                        tag_list.append((tag1, tag2, int(count)))  ### same minute and both tags non-empty: accumulate this edge
                else:
                    N_v, d_total = ut.average_degree_hadoop(tag_list, last_time, N_v, d_total, tag_dict, tag_dq, period)
                    if N_v == 0:
                        av_degree = 0
                    else:
                        av_degree = float(d_total) / N_v
                    tag_list = [(tag1, tag2, int(count))]

                    OUT3.write(last_time.strftime("%Y-%m-%d %H:%M") + "\t%.2f\n" % av_degree)
                    last_time = timestamp

        N_v, d_total = ut.average_degree_hadoop(tag_list, last_time, N_v, d_total, tag_dict, tag_dq, period)
        if N_v == 0:
            av_degree = 0
        else:
            av_degree = float(d_total) / N_v
        OUT3.write(last_time.strftime("%Y-%m-%d %H:%M") + "\t%.2f" % av_degree)
        OUT3.close()

    else:
        print "Error occurred in the MapReduce job!"
        exit(1)
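
### Sketch only: the streaming reducer output consumed above is assumed to be
### tab-separated lines of the form "2015-10-29 17:51<TAB>#tag1<TAB>#tag2<TAB>3"
### (this matches the strptime/split logic in main).  The hypothetical helper
### below shows that parsing step in isolation.
### e.g.  _parse_edge_line_sketch("2015-10-29 17:51\t#apache\t#spark\t3")
###       -> (datetime(2015, 10, 29, 17, 51), '#apache', '#spark', 3)
def _parse_edge_line_sketch(line):
    """Parse one reducer output line into (timestamp, tag1, tag2, count)."""
    from datetime import datetime
    time_str, tag1, tag2, count = line.rstrip("\n").split("\t")
    return datetime.strptime(time_str, "%Y-%m-%d %H:%M"), tag1, tag2, int(count)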