def merge_data_and_labels(labels_file, data_file, out_file):
    """Combine the sentences from data_file with the labels from labels_file.

    Writes one UTF-8 encoded, tab-separated line per sentence
    (<sentence>\t<labels>) to out_file.
    """
    # load both data sets; only the sentences of data_file and the labels
    # of labels_file are used
    X_labels, Y_labels = load_data(labels_file)
    X_data, Y_data = load_data(data_file)

    # fail fast if the two files are out of sync, instead of raising an
    # IndexError halfway through writing the output file
    if len(X_data) > len(Y_labels):
        raise ValueError('data file contains more entries than labels file')

    with codecs.open(out_file, "wb", "utf-8") as f:
        for sentence, label in zip(X_data, Y_labels):
            # sentences are byte strings; decode so codecs can re-encode them
            f.write(u"{}\t{}\n".format(sentence.decode("utf-8"), label))
def get_emotion_body_part_pairs(file_name):
    """Count emotion/body-part label co-occurrences in a data file.

    Returns a tuple (emotions, emotions2body): emotions is a Counter
    mapping each emotion label to the number of (emotion, body part)
    pairs it appears in; emotions2body maps each emotion label to a
    Counter of the body part labels it co-occurs with.
    """
    # load data set
    X_data, Y_data = load_data(file_name)

    # each entry of Y_data is an underscore-separated set of labels
    label_sets = [s.split('_') for s in Y_data]

    emotions2body = {}
    emotions = Counter()

    for labelset in label_sets:
        body_parts = [lb for lb in labelset if lb in heem_body_part_labels]
        emotion_lbls = [lb for lb in labelset if lb in heem_emotion_labels]

        # the nested loops are no-ops when either list is empty, so a
        # "both present" guard is unnecessary
        for em in emotion_lbls:
            for bp in body_parts:
                emotions2body.setdefault(em, Counter())[bp] += 1
                # incremented once per (emotion, body part) pair, so this
                # counts pairs rather than sentences
                emotions[em] += 1
    return emotions, emotions2body
    # NOTE(review): everything below is unreachable -- it follows the
    # `return` statement above, and it references a `parser` that is never
    # defined in this scope. It looks like the body of a lost
    # `if __name__ == '__main__':` section from another script that was
    # concatenated into this file. Kept as-is pending confirmation.
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir

    # create the output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # NOTE(review): `pairs` is created once, outside the loop over files,
    # so the frequencies written for each file include the counts of all
    # previously processed files as well -- confirm whether this running
    # total is intended or whether `pairs` should be reset per file.
    pairs = Counter()

    text_files = glob.glob(os.path.join(input_dir, "*.txt"))
    for i, text_file in enumerate(text_files):
        # progress indicator (Python 2 print statement)
        print "({} of {}) {}".format((i + 1), len(text_files), text_file)
        # assumes file names end in a 13-character id plus '.txt' -- TODO confirm
        text_id = text_file[-17:-4]

        X_data, Y_data = load_data(text_file)

        out_file = os.path.join(output_dir, os.path.basename(text_file))
        for j, predicted in enumerate(Y_data):
            # drop the 'None' placeholder label
            lbs = set(predicted.split("_")) - {"None"}
            emotion_labels = [l for l in lbs if l in heem_emotion_labels]
            ct_labels = [l for l in lbs if l in heem_concept_type_labels]
            if emotion_labels and ct_labels:
                # count every (emotion, concept type) combination
                for e in emotion_labels:
                    for ct in ct_labels:
                        pairs["{}\t{}".format(e, ct)] += 1

        # write the (cumulative) pair frequencies for this text
        with codecs.open(out_file, "wb", "utf-8") as f:
            for pair, freq in pairs.most_common():
                f.write("{}\t{}\t{}\n".format(text_id, pair, freq))
    # NOTE(review): like the section above, this code is unreachable (it
    # follows the `return`) and references a `parser` plus helpers
    # (`count_lines`, `num_emotional_sentences`, `average_number_of_labels`,
    # `get_tp`) that are not visible here; it appears to be the main
    # section of yet another script. Kept as-is pending confirmation.
    parser.add_argument('in_dir', help='the directory containing the'
                        ' files with the correct labels.')
    parser.add_argument('out_file', help='csv file to write the output to')
    args = parser.parse_args()

    corpus = args.corpus
    in_dir = args.in_dir
    out_file = args.out_file

    # per-text-file statistics, keyed by column name
    data = {'#lines': [], '#emotional': [], 'avg_labels': []}
    index = []

    # get # of lines
    text_files = glob.glob('{}/*.txt'.format(in_dir))
    for t in text_files:
        # the file name without extension serves as the row index
        text_id = os.path.basename(t).replace('.txt', '')
        index.append(text_id)
        X_data, Y_data = load_data(t)
        data['#lines'].append(count_lines(t))
        data['#emotional'].append(num_emotional_sentences(Y_data))
        data['avg_labels'].append(average_number_of_labels(Y_data))
    df = pd.DataFrame(data=data, index=index)

    # get time period
    corpus = pd.read_csv(corpus, sep='\t', header=None, index_col=0)
    corpus.loc[:, 'period'] = corpus.apply(get_tp, axis=1)

    # write result to file
    result = pd.concat([df, corpus['period']], axis=1)
    result.to_csv(out_file)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input_dir', help='the directory where the input text '
                        'files can be found.')
    parser.add_argument('output_dir', help='the directory where the output '
                        'files should be written.')
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir

    # create the output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # translate the HEEM labels of every input text file to English and
    # write the result to a file of the same name in the output directory
    text_files = [t for t in os.listdir(input_dir) if t.endswith('.txt')]
    for text_file in text_files:
        in_file = os.path.join(input_dir, text_file)
        x_data, y_data = load_data(in_file)
        # each label string is an underscore-separated set of labels
        labels = [y.split('_') for y in y_data]

        out_file = os.path.join(output_dir, text_file)
        with codecs.open(out_file, 'wb', 'utf-8') as f:
            # x_data and labels are assumed parallel: load_data returns one
            # label string per sentence -- TODO confirm
            for sentence, ls in zip(x_data, labels):
                # labels without an English translation become 'None'
                new_labels = [heem_labels_en.get(l, 'None') for l in ls]
                # sentences are byte strings; decode before writing UTF-8
                f.write(u'{}\t{}\n'.format(sentence.decode('utf-8'),
                                           '_'.join(new_labels)))