Example #1
def preprocess(args):
    parser = argparse.ArgumentParser(description="args for preprocess")
    parser.add_argument("data_type", type=str)
    parser.add_argument("input", type=str)
    parser.add_argument("-o",
                        "--out_dir",
                        help="output dir",
                        type=str,
                        default="")
    parser.add_argument("-w",
                        "--workers",
                        help="number of workers",
                        type=int,
                        default=20)
    parser.add_argument("-d",
                        "--debug",
                        help="debug mode",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbose",
                        help="verbose mode",
                        action="store_true",
                        default=False)

    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(folder=options.out_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)

    processor = get_processor(options.data_type, options.input, out_dir,
                              logger, options.workers)
    processor.start()
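These subcommands take an explicit args list instead of reading sys.argv themselves, so they are meant to be called from a small driver. A minimal dispatch sketch, assuming a hypothetical top-level script cli.py (not part of the original module):

import sys

if __name__ == "__main__":
    # e.g. python cli.py preprocess reddit posts.txt -o results -w 8 -v
    command, rest = sys.argv[1], sys.argv[2:]
    if command == "preprocess":
        preprocess(rest)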
Example #2
def coverage(args):

    parser = argparse.ArgumentParser(description="args for coverage")
    parser.add_argument("-o",
                        "--out_dir",
                        help="output dir",
                        type=str,
                        default="")
    parser.add_argument("-d",
                        "--debug",
                        help="debug mode",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbose",
                        help="verbose mode",
                        action="store_true",
                        default=False)

    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(options.out_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)

    Coverage(out_dir=out_dir, logger=logger).analyze()
Example #3
def compare_impl(probability, occurrence, model, output):
    res = dict()
    logging.info("Start comparing...")
    start_ts = time.time()
    for row_idx, row in enumerate(probability):
        word = model.wv.index2word[row_idx]
        top_prob_indices = gensim.matutils.argsort(row, topn=40, reverse=True)
        top_occur_indices = gensim.matutils.argsort(occurrence[row_idx],
                                                    topn=40,
                                                    reverse=True)
        top_prediction = [(model.wv.index2word[index1], float(row[index1]))
                          for index1 in top_prob_indices]
        top_occurrence = [(model.wv.index2word[index1],
                           float(occurrence[row_idx][index1]))
                          for index1 in top_occur_indices]
        res[word] = dict()
        res[word]['most_probable'] = top_prediction
        res[word]['most_occurred'] = top_occurrence
        # res[word]['bhattacharyya'] = bhattacharyya(row, occurrence[row_idx])
        res[word]['cosine'] = cosine(row, occurrence[row_idx])
        if row_idx == len(probability) - 1 or row_idx % 100 == 0:
            current_ts = time.time()
            logging.info(
                "Processed_words: {:d} Progress: {:.02%}  Words/sec: {:.02f}".
                format(row_idx, row_idx / len(probability),
                       row_idx / (current_ts - start_ts)))

    outfile = get_res_filepath(output)
    with open(outfile, 'w') as fd:
        json.dump(res, fd, indent=2)
    logging.info("Job finished, results saved at '{}'".format(outfile))
Example #4
def get_text_stats(args):

    parser = argparse.ArgumentParser(description="args for preprocess")
    parser.add_argument("data_type",
                        choices=["reddit", "hackforums", "darkode", "nulled"],
                        type=str)
    parser.add_argument("-o",
                        "--out_dir",
                        help="output dir",
                        type=str,
                        default="")
    parser.add_argument("-d",
                        "--debug",
                        help="debug mode",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbose",
                        help="verbose mode",
                        action="store_true",
                        default=False)

    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(options.out_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)

    TextStats(data_type=options.data_type, out_dir=out_dir,
              logger=logger).analyze()
Example #5
    def get_coverage(white, dark):
        white_set = white.keys()
        dark_set = dark.keys()
        common_set = white_set & dark_set
        coverage = len(common_set) / len(dark_set)
        dark_total = sum(dark.values())
        common_total = sum([dark[x] for x in common_set])
        print("common unique words coverage: {:.2%} ({}/{})".format(
            coverage, len(common_set), len(dark_set)))
        print("common words coverage: {:.2%} ({}/{})".format(
            (common_total / dark_total), common_total, dark_total))

        missed_words = {x: dark[x] for x in (dark_set - white_set)}
        outfile = get_res_filepath("missed_words.json")
        with open(outfile, 'w') as fd:
            json.dump(missed_words, fd, indent=2)
        print(outfile)
        common_words = {x: dark[x] for x in common_set}
        outfile = get_res_filepath("common_words.json")
        with open(outfile, 'w') as fd:
            json.dump(common_words, fd, indent=2)
        print(outfile)
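get_coverage reports both unique-word coverage (the share of dark-vocabulary entries that also occur in the white vocabulary) and token-weighted coverage, then dumps the missed and common word counts as JSON. A hypothetical call with toy counts, assuming the helper is callable directly:

white = {"the": 100, "exploit": 3}
dark = {"the": 50, "exploit": 40, "crypter": 10}
get_coverage(white, dark)
# common unique words coverage: 66.67% (2/3)
# common words coverage: 90.00% (90/100), i.e. (50 + 40) / (50 + 40 + 10)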
Example #6
def stats(args):
    global logger
    parser = argparse.ArgumentParser(description="args for parse_annotated")
    parser.add_argument("-a",
                        "--annotations",
                        help="annotations dir",
                        type=str,
                        default="annotations.json")
    parser.add_argument("-m",
                        "--model",
                        help="model name",
                        type=str,
                        default="forums.it100")
    parser.add_argument("-o",
                        "--out_file",
                        help="output dir",
                        type=str,
                        default="stats")
    # parser.add_argument(
    #     "-w", "--workers", help="number of workers", type=int, default=10)
    parser.add_argument("-d",
                        "--debug",
                        help="debug mode",
                        action="store_true",
                        default=False)
    parser.add_argument("-s",
                        "--sentence",
                        help="output sentence",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbose",
                        help="verbose mode",
                        action="store_true",
                        default=False)

    options = parser.parse_args(args)

    log_config = dict(name=__file__, debug=options.debug)

    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)

    with open(get_res_filepath(fn=options.annotations)) as fd:
        annotations = json.load(fd)
    stats_impl(annotations=annotations,
               model=options.model,
               out_file=options.out_file,
               sen=options.sentence)
Example #7
def build_vocab_impl(input, output, min_count):
    # print(options)
    # print(type(options))
    if input and os.path.isfile(input):
        sentences = LineSentence(input, max_sentence_length=10000)
    else:
        print("Error: input file '{}' not found".format(input))
        return 1

    outfile = get_res_filepath(fn=output)
    #  -cbow 0 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 0 -iter 15
    model = gensim.models.Word2Vec(min_count=min_count)
    model.build_vocab(sentences=sentences)

    model.save(outfile)
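The saved model contains only a vocabulary; its vectors are untrained. A hedged follow-up sketch using the same pre-4.0 gensim API as the rest of these examples (the file names vocab.model and corpus.txt are hypothetical):

import gensim
from gensim.models.word2vec import LineSentence

model = gensim.models.Word2Vec.load("vocab.model")                  # model saved by build_vocab_impl
sentences = LineSentence("corpus.txt", max_sentence_length=10000)   # the same corpus file
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)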
Example #8
def prepare(args):
    parser = argparse.ArgumentParser(description="args for prepare")
    parser.add_argument(
        "-i", "--in_dir", help="input dir", type=str, default="")
    parser.add_argument(
        "-o", "--out_dir", help="output dir", type=str, default="")
    parser.add_argument(
        "-t", "--workers", help="number of workers", type=int, default=10)
    parser.add_argument(
        "-f",
        '--forums',
        nargs='+',
        required=True,
        choices=allforums,
        help='specifies target forum(s)')
    parser.add_argument(
        "-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument(
        "-v",
        "--verbose",
        help="verbose mode",
        action="store_true",
        default=False)

    options = parser.parse_args(args)

    selections = [f for f in options.forums if f in allforums]

    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(folder=os.path.join('text2data', options.out_dir))
    in_dir = os.path.join(PREPROCESSED_DIR, options.in_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)

    TrainingPrepare(
        in_dir=in_dir,
        out_dir=out_dir,
        logger=logger,
        forums=selections,
        workers=options.workers).go()
Example #9
def predict_impl(good_model, bad_model, output):
    logging.info("start calculating probability")
    if not good_model.negative or not bad_model.negative:
        raise RuntimeError(
            "We have currently only implemented predict_output_word for the negative sampling scheme, "
            "so you need to have run word2vec with negative > 0 for this to work."
        )

    if not hasattr(bad_model.wv, 'syn0') or not hasattr(good_model, 'syn1neg'):
        raise RuntimeError(
            "Parameters required for predicting the output words not found.")

    syn0 = bad_model.wv.syn0
    syn1 = good_model.syn1neg

    probability = exp(dot(syn0, syn1.T))
    rows, columns = probability.shape
    logging.info("probability matrix rows: {}, columns: {}".format(
        rows, columns))
    sums = np_sum(probability, axis=1)
    logging.info("probability sum matrix shape: {}".format(sums.shape))

    probability = probability / sums[:, None]
    logging.info("probability calculation finished")
    pred_outfile = get_res_filepath(fn="{}.prob.npy".format(output))
    t1 = Thread(target=save, args=(pred_outfile, probability))
    t1.start()

    # logging.info("start occurrence counting")
    # occurrence = zeros((rows, rows))
    # # TODO
    # logging.info("occurrence counting finished")
    # occur_outfile = get_res_filepath(fn="{}.occur.npy".format(output))
    # t2 = Thread(target=save, args=(occur_outfile, occurrence))
    # save(occur_outfile, occurrence)
    # compare_outfile = get_res_filepath(fn="{}.compare.json".format(output))
    # compare_impl(probability, occurrence, bad_model, compare_outfile)

    t1.join()
    logging.info("prediction results saved at '{}'".format(pred_outfile))
Example #10
def compare_pred_impl(p1, p2, p3, model, output, threads_n):
    progress = AtomicCounter()
    res = dict()
    logging.info("Start comparing...")
    start_ts = time.time()
    threads = []
    batch = math.ceil(len(p1) / threads_n)
    for i in range(threads_n):
        t = Thread(target=compare_pred_thread,
                   args=(res, p1, p2, p3, model, batch * i, batch + batch * i,
                         start_ts, progress))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    corrs = [x['correlation'] for x in res.values()]
    pvs = [x['pvalue'] for x in res.values()]
    jac40 = list(zip(*[x['jac40'] for x in res.values()]))
    jac100 = list(zip(*[x['jac100'] for x in res.values()]))
    jac1000 = list(zip(*[x['jac1000'] for x in res.values()]))
    prob_std1 = [x['prob_std1'] for x in res.values()]
    prob_std2 = [x['prob_std2'] for x in res.values()]
    prob_std3 = [x['prob_std3'] for x in res.values()]

    stats = dict()
    stats["correlation"] = stat_dict(corrs)
    stats["pvalue"] = stat_dict(pvs)
    stats["jac40"] = stat_dict(jac40[0]), stat_dict(jac40[1])
    stats["jac100"] = stat_dict(jac100[0]), stat_dict(jac100[1])
    stats["jac1000"] = stat_dict(jac1000[0]), stat_dict(jac1000[1])
    stats["prob_std1"] = stat_dict(prob_std1)
    stats["prob_std2"] = stat_dict(prob_std2)
    stats["prob_std3"] = stat_dict(prob_std3)

    outfile = get_res_filepath(output)
    with open(outfile, 'w') as fd:
        json.dump(dict(stats=stats, details=res), fd, indent=2)
    logging.info("Job finished, results saved at '{}'".format(outfile))
Example #11
def parse_annotated(args):
    global logger
    parser = argparse.ArgumentParser(description="args for parse_annotated")
    parser.add_argument("-i",
                        "--in_dir",
                        help="input dir",
                        type=str,
                        default="")
    parser.add_argument("-o",
                        "--out_file",
                        help="output dir",
                        type=str,
                        default="annotations.json")
    parser.add_argument("-w",
                        "--workers",
                        help="number of workers",
                        type=int,
                        default=10)
    parser.add_argument("-d",
                        "--debug",
                        help="debug mode",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbose",
                        help="verbose mode",
                        action="store_true",
                        default=False)

    options = parser.parse_args(args)

    log_config = dict(name=__file__, debug=options.debug)
    out_file = get_res_filepath(fn=options.out_file)
    in_dir = os.path.join(PREPROCESSED_DIR, options.in_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)

    parse_annotated_impl(in_dir=in_dir, out_file=out_file)
Example #12
def train(args):
    # log_config = dict(name=__file__, console_verbosity=logging.INFO)
    # logger = init_log(**log_config)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    text8 = gensim.models.word2vec.Text8Corpus(
        "/u/kanyuan/text8/text8_nonstop", max_sentence_length=10000)

    sentences = list(text8)
    outfile = get_res_filepath(fn="text8_nonstop.model_1")
    #  -cbow 0 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 0 -iter 15
    model = gensim.models.Word2Vec(sentences,
                                   workers=20,
                                   window=10,
                                   negative=25,
                                   sg=1,
                                   size=200,
                                   sample=0.0001,
                                   iter=15,
                                   compute_loss=True)
    model.save(outfile)
Example #13
import os
import sqlite3
import traceback

from multiprocessing.pool import ThreadPool
from collections import Counter
from nltk.tokenize import sent_tokenize
# from gensim.utils import tokenize

from ..utils.misc import tokenize
from monster.misc import get_res_filepath
from monster.log import init_log
from monster.atomic import AtomicCounter
from .preprocessor import pattern_url, pattern_email, pattern_hash, en_stopwords

PREPROCESSED_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, "preprocessing"))

allforums = ["darkode", "hackforums", "nulled", "silkroad"]

frequent_bar = 10

code_indicators = "(){}[].;\""

class TrainingPrepare(object):
    def __init__(self, in_dir, out_dir, logger, forums, workers):
        self.logger = logger
        self.out_dir = out_dir
        self.db_dir = in_dir
        self.selections = forums
        self.logger.info("Init finished")
        self.pool = ThreadPool(processes=workers)
Example #14
import json
import argparse
import os
import logging
import gensim
import time
import math
import numpy as np

from threading import Thread
from scipy.stats import spearmanr
from monster.misc import get_res_filepath
from monster.atomic import AtomicCounter

DATA_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, "compare 2 predictions"))


def compare_pred(args):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    parser = argparse.ArgumentParser(description="args for prediction")
    parser.add_argument("--prob1", help="prob1 file", type=str)
    parser.add_argument("--prob2", help="prob2 file", type=str)
    parser.add_argument("--prob3", help="prob3 file", type=str)
    parser.add_argument("-m",
                        "--model",
                        help="eithor good or bad model",
                        type=str)
    parser.add_argument("-t",
Example #15
def prepare(args):
    parser = argparse.ArgumentParser(description="args for prepare")
    parser.add_argument("-i",
                        "--in_dir",
                        help="input dir",
                        type=str,
                        default="")
    parser.add_argument("-o",
                        "--out_dir",
                        help="output dir",
                        type=str,
                        default="")
    parser.add_argument("-t",
                        "--workers",
                        help="number of workers",
                        type=int,
                        default=10)
    parser.add_argument("-f",
                        '--forums',
                        nargs='+',
                        required=True,
                        choices=all_dark_forums + all_white_forums,
                        help='specifies target forum(s)')
    parser.add_argument("-d",
                        "--debug",
                        help="debug mode",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbose",
                        help="verbose mode",
                        action="store_true",
                        default=False)

    options = parser.parse_args(args)

    dark_selections = []
    white_selections = []
    wiki = False

    for choice in options.forums:
        if choice == "wiki":
            wiki = True
        elif choice in all_dark_forums:
            dark_selections.append(choice)
        elif choice in all_white_forums:
            white_selections.append(choice)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(folder=options.out_dir)
    in_dir = os.path.join(PREPROCESSED_DIR, options.in_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)

    TrainingPrepare(in_dir=in_dir,
                    out_dir=out_dir,
                    logger=logger,
                    dark=dark_selections,
                    white=white_selections,
                    wiki=wiki,
                    workers=options.workers).go()
Example #16
    def __init__(self, out_dir, logger):
        self.logger = logger
        self.out_dir = out_dir
        self.vocab_dir = get_res_filepath()
        self.dark_vocabs = dict()
        self.white_vocabs = dict()
Example #17
def train(args):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    parser = argparse.ArgumentParser(description="args for training")
    parser.add_argument("-g", "--good", help="good corpus file", type=str)
    parser.add_argument("-b", "--bad", help="bad corpus file", type=str)
    parser.add_argument("-o",
                        "--output",
                        help="output filename",
                        type=str,
                        default="word2vec.model")
    parser.add_argument("-s",
                        "--size",
                        help="word vector size",
                        type=int,
                        default=100)
    parser.add_argument("-a",
                        "--alpha",
                        help="initial learning rate",
                        type=float,
                        default=0.025)
    parser.add_argument("-w",
                        "--window",
                        help="window size",
                        type=int,
                        default=5)
    parser.add_argument("-S",
                        "--sample",
                        help="subsampling rate",
                        type=float,
                        default=1e-3)
    parser.add_argument("-T",
                        "--threads",
                        help="thread number",
                        type=int,
                        default=3)
    parser.add_argument("--min_alpha",
                        help="minimal learning rate",
                        type=float,
                        default=0.0001)
    parser.add_argument("--sg",
                        help="skip gram (1) or cbow (0)",
                        type=int,
                        default=0)
    parser.add_argument("--hs",
                        help="using hierarchical softmax (1) or not (0)",
                        type=int,
                        default=0)
    parser.add_argument("-n",
                        "--negative",
                        help="negative sampling",
                        type=int,
                        default=5)
    parser.add_argument("--cbow_mean", help="cbow mean", type=int, default=1)
    parser.add_argument("-i", "--iter", help="iterations", type=int, default=5)
    parser.add_argument("--min_count",
                        help="minimal occurrence of words to be considered",
                        type=int,
                        default=5)

    options = parser.parse_args(args)
    vocab = dict()

    logging.info("loading corpus...")
    if options.good and os.path.isfile(options.good):
        good_sentences = list(
            LineSentence(options.good, max_sentence_length=10000))
        vocab['good'] = get_vocab(options.good)
    else:
        logging.error("Error: good corpus file '{}' not found".format(
            options.good))
        return 1

    if options.bad and os.path.isfile(options.bad):
        bad_sentences = list(
            LineSentence(options.bad, max_sentence_length=10000))
        vocab['bad'] = get_vocab(options.bad)

    else:
        bad_sentences = list()

    min_count = options.min_count

    good_outfile = get_res_filepath(fn="{}.good.model".format(options.output))
    bad_outfile = get_res_filepath(fn="{}.bad.model".format(options.output))
    vocab_outfile = get_res_filepath(fn="{}.vocab".format(options.output))

    with open(vocab_outfile, "w") as fd:
        json.dump(vocab, fd)

    good_model = gensim.models.Word2Vec(
        workers=options.threads,
        window=options.window,
        negative=options.negative,
        sg=options.sg,
        size=options.size,
        sample=options.sample,
        min_count=min_count,
        iter=options.iter,
        alpha=options.alpha,
        min_alpha=options.min_alpha,
        hs=options.hs,
        cbow_mean=options.cbow_mean,
    )

    good_model.build_vocab(good_sentences + bad_sentences)
    good_model.train(good_sentences,
                     total_examples=len(good_sentences),
                     epochs=good_model.iter)
    good_model.save(good_outfile)

    if bad_sentences:
        bad_model = gensim.models.Word2Vec(
            workers=options.threads,
            window=options.window,
            negative=options.negative,
            sg=options.sg,
            size=options.size,
            sample=options.sample,
            min_count=min_count,
            iter=options.iter,
            alpha=options.alpha,
            min_alpha=options.min_alpha,
            hs=options.hs,
            cbow_mean=options.cbow_mean,
        )

        bad_model.build_vocab(good_sentences + bad_sentences)
        bad_model.train(bad_sentences,
                        total_examples=len(bad_sentences),
                        epochs=bad_model.iter)
        bad_model.save(bad_outfile)
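Both training examples use the pre-4.0 gensim keyword names (size, iter) and attributes (model.iter, wv.index2word, wv.syn0). A hedged sketch of the equivalent constructor under gensim 4.x, where those parameters were renamed:

import gensim

model = gensim.models.Word2Vec(
    workers=20,
    window=10,
    negative=25,
    sg=1,
    vector_size=200,   # was size=200
    sample=1e-4,
    epochs=15,         # was iter=15
)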
Example #18
import os
import logging

from gensim.matutils import argsort, unitvec
from gensim.utils import tokenize as tokenize1
from ..utils.misc import tokenize as tokenize2

from collections import defaultdict

from multiprocessing.pool import ThreadPool
from collections import Counter
from nltk.tokenize import sent_tokenize
# from gensim.utils import tokenize

from monster.misc import get_res_filepath
from monster.log import init_log

PREPROCESSED_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, "preprocessing"))

MODEL_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, os.pardir, "word2vec/jargon"))
tokenize = tokenize2


def load_cmodel(model_fn):
    model = dict()
    with open(model_fn) as fd:
        rows, columns = map(int, fd.readline().split())
        for idx, line in enumerate(fd):
            line = line.strip()
            fields = line.split()
            if (len(fields) != columns + 1):
                logging.error("malformatted model file")
Example #19
def stats_impl(annotations, model, out_file, sen):
    # resolve both output paths from the base name before it is overwritten
    out_file2 = get_res_filepath(fn="{}_missed.csv".format(out_file))
    out_file = get_res_filepath(fn="{}.csv".format(out_file))

    logger.info("init finished")

    cands = list(get_candidates(model))
    words = [x[1] for x in cands]
    if sen:
        logger.info("start preparing data")
        db_dark = prepare_dark(words)
        logger.info("dark data preparing finished")
        db_white = prepare_white(words)
        logger.info("white data preparing finished")
    else:
        db_dark = dict()
        db_white = dict()

    csvfd = open(out_file, 'w')
    spamwriter = csv.writer(csvfd)
    for rank, word, sim, good_interpretation, bad_interpretation, normal_interpretation in cands:
        logger.info("processing word '{}''".format(word))
        good_sen = "...... ".join(db_white.get(word, []))
        bad_sen = "...... ".join(db_dark.get(word, []))
        good_interpretation = [x[0] for x in good_interpretation]
        bad_interpretation = [x[0] for x in bad_interpretation]
        normal_interpretation = [x[0] for x in normal_interpretation]
        if word in annotations:
            labeled = True
            label = label2str(annotations[word])
        else:
            labeled = False
            label = ""

        spamwriter.writerow([
            rank, word, good_interpretation, bad_interpretation, bad_sen,
            normal_interpretation, good_sen, labeled, label
        ])

    csvfd.close()

    csvfd = open(out_file2, 'w')
    spamwriter = csv.writer(csvfd)
    spamwriter.writerow([
        "word", "score", "gcn", "bcn", "good_interpretation",
        "bad_interpretation", "bad_sen", "normal_interpretation", "good_sen",
        "labeled", "label"
    ])
    for word in annotations:
        if word not in words:
            sim, gcn, bcn, good_interpretation, bad_interpretation, normal_interpretation = get_info(
                word)
            logger.info("processing missing word '{}''".format(word))
            # good_sen = "...... ".join(db_white.get(word, []))
            # bad_sen = "...... ".join(db_dark.get(word, []))
            good_interpretation = [x[0] for x in good_interpretation]
            bad_interpretation = [x[0] for x in bad_interpretation]
            normal_interpretation = [x[0] for x in normal_interpretation]
            if word in annotations:
                labeled = True
                label = label2str(annotations[word])
            else:
                labeled = False
                label = ""

            spamwriter.writerow([
                word, sim, gcn, bcn, good_interpretation, bad_interpretation,
                bad_sen, normal_interpretation, good_sen, labeled, label
            ])

    csvfd.close()

    logger.info("data saved at '{}' and '{}'".format(out_file, out_file2))