def run_classifiers_cv(file_names,
                       ptest=0.2,
                       cv=3,
                       max_sentences=4,
                       as_sentences=False,
                       labels=['ACCOUNT']):
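    """Run `cv` rounds of random train/test splits over the loaded excerpt
    data, score every vectorizer/classifier pair on each label, and return
    the pooled precision/recall/fscore results as a DataFrame. Intermediate
    results are written to CSV after each label and each round."""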

    data = load_xlsx_data(file_names,
                          max_sentences=max_sentences,
                          as_sentences=as_sentences,
                          labels=labels)

    vectorizers_dict = gen_tfidf_vectorizers_dict()
    classifiers_dict = gen_classifiers_dict()

    cv_results_df = pd.DataFrame(columns=[
        'cv', 'classifier', 'vectorizer', 'precision', 'recall', 'fscore',
        'label', 'num_train_exs', 'num_test_exs'
    ])

    random.seed(a=3)
    for c in range(cv):
        print("*****************START CV=" + str(c) +
              "************************")
        start_time = time.time()
        rand_int = random.randint(0, 1000)
        data_train, data_test = train_test_split(pd.DataFrame(data),
                                                 test_size=ptest,
                                                 random_state=rand_int)
        for label in labels:
            if sum(data_train[label]) < 2 or sum(data_test[label]) < 2:
                print("fewer than 2 examples of label: " + label)
            elif any(data_train[label] > 1):
                # label columns are expected to be binary (0/1); a value > 1
                # means the label data for this column is malformed
                print("given labels invalid: " + label)
            else:
                print("num train exs: " + str(sum(data_train[label])))
                results = run_classifiers(data_train,
                                          data_test,
                                          vectorizers_dict,
                                          classifiers_dict,
                                          label=label)
                # broadcast the per-round metadata across all result rows
                results['cv'] = c
                results['num_train_exs'] = sum(data_train[label])
                results['num_test_exs'] = sum(data_test[label])
                # DataFrame.append was removed in pandas 2.0; use pd.concat
                cv_results_df = pd.concat([cv_results_df, results],
                                          ignore_index=True)
            cv_results_df.to_csv(
                results_path + "cv_" + str(c) + "_" + label +
                "_label_fulldata_excerpts_tfidf_cv_results_labels.csv")
        end_time = time.time()
        print("Time for cv=" + str(c) + " : " + str(end_time - start_time))
        cv_results_df.to_csv(results_path + "cv_" + str(c) +
                             "_fulldata_excerpts_tfidf_cv_results_labels.csv")

    return cv_results_df
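

# Example usage (a sketch: the file name matches the default used by
# produce_visualization below, and the label set is illustrative):
#
#   cv_df = run_classifiers_cv(
#       ["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
#       ptest=0.2, cv=3, labels=['ACCOUNT', 'HERO'])
#   print(cv_df.groupby(['classifier', 'vectorizer'])['fscore'].mean())
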
def produce_visualization(
        file_names=["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
        tokenizer=stem_tokenizer,
        labels=['ACCOUNT', 'HERO'],
        max_sentences=None,
        as_sentences=False,
        output_file='ldavis'):
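    """Build a pyLDAvis view that treats each label as a "topic": topic-term
    distributions come from per-label word frequencies, and document-topic
    distributions come from the (normalized) label matrix. Saves the prepared
    data as JSON, the topic ordering as TXT, and the visualization as HTML."""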
    data = load_data.load_xlsx_data(file_names,
                                    max_sentences=max_sentences,
                                    as_sentences=as_sentences,
                                    labels=labels)

    excerpts = list(data['Excerpts'])

    # exclude labels with no positive examples in these files
    keep_labels = []
    for lab in labels:
        if sum(data[lab]) > 0:
            keep_labels.append(lab)
        else:
            print(lab + " label not present in files: " + str(file_names))
    labels = keep_labels

    # subset of the data frame containing only the label columns
    main_types_df = data[labels]

    # 1-based index so rows line up with the excerpt positions used below
    main_types_df.index = range(1, main_types_df.shape[0] + 1)

    # drop rows and excerpts with no label
    # build vocab and doc_lengths
    all_words = []
    doc_lengths = []
    main_types_excerpts = []
    for idx, doc in enumerate(excerpts):
        if sum(main_types_df.loc[idx + 1]) < 1:
            # if this document had no main type label
            main_types_df = main_types_df.drop([idx + 1], axis=0)
        else:
            main_types_excerpts.append(doc)
            doc_toks = tokenizer(doc)  # honor the tokenizer argument
            all_words.extend(doc_toks)
            doc_lengths.append(len(doc_toks))
    fdist = FreqDist(all_words)
    fdistmc = fdist.most_common()
    vocab = [word for word, count in fdistmc]
    term_frequency = [count for word, count in fdistmc]
    print("number of labelled documents: " + str(len(doc_lengths)))

    # build topic-term distribution
    stop_words = set(stopwords.words('english'))
    freq_dist_dict = {}
    topic_size = []
    topic_num_words = []
    for coln in main_types_df.columns:
        # select the excerpts that carry this label
        categ_excerpts = list(
            compress(main_types_excerpts, main_types_df[coln].values))
        exq = [tokenizer(doc) for doc in categ_excerpts]
        excerpt_words = [tok for tok_list in exq for tok in tok_list]
        topic_size.append(len(exq))
        topic_num_words.append(len(excerpt_words))
        words = [
            word for word in excerpt_words
            if word.lower() not in stop_words and word.isalpha()
        ]
        freq_dist_dict[coln] = FreqDist(words)

    topic_term_dists = []

    for coln in main_types_df.columns:
        ffdist = freq_dist_dict[coln]
        # np.nextafter(0., 1.) is the smallest positive float: terms unseen
        # under this label get a negligible but non-zero probability
        fdist = [
            ffdist.freq(word) if word in ffdist else np.nextafter(0., 1.)
            for word in vocab
        ]
        topic_term_dists.append([float(f) for f in fdist])

    # Document-topic distribution
    doc_topic_dists = []
    for index, rowi in main_types_df.iterrows():
        row = list(rowi)
        row_sum = sum(row)
        if row_sum == 0:
            # should not happen: rows with no label were dropped above
            print("zero-label row at index " + str(index) + ": " + str(row))
        elif row_sum > 1.01 or row_sum < 0.99:
            # normalize multi-label rows into a proper distribution
            row = [r / row_sum for r in row]
        doc_topic_dists.append([float(r) for r in row])

    # format for pyLDAvis
    data_dict = {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency
    }

    # save data as json
    with open(output_file + '.json', 'w') as json_file:
        json.dump(data_dict, json_file)
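
    # Saving the prepared data as JSON makes it possible to rebuild the
    # visualization later without reprocessing the spreadsheets (a sketch):
    #
    #   with open(output_file + '.json') as jf:
    #       vis_data = pyLDAvis.prepare(**json.load(jf), n_jobs=-1)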

    vis_data = pyLDAvis.prepare(**data_dict, n_jobs=-1)

    # pyLDAvis sorts topics by size; map its display order back to the labels
    col_order = vis_data.topic_order
    categs = list(main_types_df.columns)
    string_list = [""] * len(col_order)
    for idx, i in enumerate(col_order):
        # entries of topic_order are 1-based indices into the original topics
        msg = ("Topic " + str(idx + 1) + ": " + categs[i - 1] +
               ", number of words: " + str(topic_num_words[i - 1]))
        print(msg)
        string_list[idx] = msg

    with open(output_file + '.txt', 'w') as f:
        for msg in string_list:
            f.write("%s\n" % msg)

    pyLDAvis.save_html(vis_data, output_file + '.html')
    return vis_data
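

# Example usage (a sketch; defaults mirror the signature above, and the
# output files are written to the working directory):
#
#   vis = produce_visualization(
#       file_names=["Isla Vista - All Excerpts - 1_2_2019.xlsx"],
#       labels=['ACCOUNT', 'HERO'],
#       output_file='ldavis_account_hero')
#   # -> ldavis_account_hero.json / .txt / .html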