def main(args):
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    cfg.MAX_STEP = 50
    cfg.BATCH_SIZE = 1
    cfg.TRAIN_QUEUE_CAPACITY = 10

    if not os.path.isdir(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)

    logger = log_helper.get_logger()

    data_pipeline = TFLoadingPipeline(cfg, logger, shuffle=True)
    data_pipeline.setup(FLAGS.sample_path, FLAGS.label_path, cfg.BATCH_SIZE,
                        cfg.TRAIN_QUEUE_CAPACITY)

    with tf.Session() as sess:
        data_pipeline.start(sess)
        for step in xrange(cfg.MAX_STEP):
            image_batch, label_batch = data_pipeline.load_batch()

            logger.info('output {}th image for validation'.format(step))
            out_fname = '{}/{}.png'.format(FLAGS.output_dir, step)
            image = image_batch[0].astype(np.uint8)
            r, g, b = cv2.split(image)
            image = cv2.merge((b, g, r))
            label = label_batch[0].astype(np.uint8)
            mask = image.copy()
            mask[:, :, 1][label[:, :, 0] > 0] = 255
            overlay = cv2.addWeighted(image, 0.5, mask, 0.5, 0)
            cv2.imwrite(out_fname, overlay)

        data_pipeline.shutdown()
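
Every snippet in this listing obtains its logger from a log_helper.get_logger (or utils.log_helper.get_logger) helper whose implementation is not shown. Below is a minimal sketch of what such a helper could look like, assuming it only wraps the standard logging module; the name get_logger_sketch and the format string are illustrative, not the project's actual code.

import logging


def get_logger_sketch(name='default', level=logging.INFO):
    """A console logger with a timestamped format; the project's helper may differ."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter('%(asctime)s %(name)s %(levelname)s: %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(level)
    return logger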
Example #2
 def crawl(self):
     """
     一个子进程执行的爬取任务
     流程:
     1.从共享url队列中取出一个url, 若无则使用搜索引擎获取更多起始地址
     2.使用request_url函数获取网页response
     3.使用ParseHelper中的解析函数解析网页
     4.将数据存储到MongoDB数据库中
     5.将网页中解析出来的url放入共享队列
     6.记录日志
     [此方案需可以改进的地方:将request请求url部分与后续处理部分分离,
     采用异步HTTP请求的方式进一步爬取提高效率(1,2)(3,4,5)分离]
     :parameter logger: 日志生成对象,默认过滤级别为logging.INFO
     :return: None
     """
     queue = get_queue_object(self.queue_type)
     pipline = get_pipline_object(self.pipline_type)
     logger = get_logger('blockchain_spider', to_file=True, filename='spider')
     while True:
         url = queue.get_url_from_queue()
         response = request_url(url, timeout=self.timeout)
         first_parsed_data = ParseHelper.first_parse_response(response, keyword=self.keyword)
         new_urls = first_parsed_data['urls'] if first_parsed_data else None
         pipline.save_html_data(first_parsed_data)
         url_amount = queue.put_urls_in_queue(new_urls)
         logger.info(f"{url} has been crawled.")
         if url_amount:
             logger.info(f"There are {url_amount} urls in queue now.")
Example #3
def main(args):

    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    print_config(cfg)

    logger = log_helper.get_logger()
    logger.info("show information about {}:".format(FLAGS.model))
    if FLAGS.model == 'res50':
        model = Res50DispNet(cfg, logger)
    else:
        logger.error('wrong model type: {}'.format(FLAGS.model))
        sys.exit(-1)
def main():
    args = parser.parse_args()

    image_list = []
    with open(os.path.join(args.in_samples, args.in_list)) as f:
        for line in f:
            image_list.append(
                os.path.join(args.in_samples,
                             line.split('.png')[-2] + '.png'))

    logger = log_helper.get_logger()
    begin_ts = time.time()
    total_time_elapsed, eval_rets = eval_with_model(
        args.model_file, image_list, args.in_samples, args.out_infer,
        args.out_eval, args.max_pixel_dis, logger)
    end_ts = time.time()
    logger.info("total pipeline time elapsed: {} s".format(end_ts - begin_ts))
    logger.info("total infer time elapsed: {} s".format(total_time_elapsed))
    ave_time_elapsed = total_time_elapsed / len(image_list)
    logger.info("average infer time elapsed: {} s".format(ave_time_elapsed))

    for i in range(3):
        eval_rets_i = [
            eval_rets[x * 4 + i] for x in range(len(eval_rets) // 4)
        ]
        ag_ret = dict(count=len(eval_rets_i),
                      metrics=calculate(aggregate_results(eval_rets_i)),
                      compare_list=args.in_list,
                      out_folder=args.out_eval)
        json.dump(ag_ret,
                  open(os.path.join(args.out_eval, str(i), 'L4E_result.json'),
                       'w'),
                  indent=2)

    eval_rets_overall = [
        eval_rets[x * 4 + 3] for x in range(len(eval_rets) // 4)
    ]
    ag_ret = dict(count=len(eval_rets_overall),
                  metrics=calculate(aggregate_results(eval_rets_overall)),
                  compare_list=args.in_list,
                  out_folder=args.out_eval)
    json.dump(ag_ret,
              open(os.path.join(args.out_eval, 'overall', 'L4E_result.json'),
                   'w'),
              indent=2)
import json

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import file_helper
from utils import scikit_ml_helper

from processors.processor import Processor
from utils import log_helper

log = log_helper.get_logger("AmazonLineProcessorTFIDF")


class AmazonLineProcessorTfIdf(Processor):
    def __init__(self, labeled_articles_source_file_path,
                 doc2vec_model_file_path, ml_model_file_path,
                 articles_source_file_path, shuffle_count,
                 classification_sources_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = shuffle_count
        self.classification_sources_file_path = classification_sources_file_path

    def process(self):

        log.info("Commencing execution")

        with open(self.classification_sources_file_path) as source_cfg:
            sources_dict = json.load(source_cfg)
    "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \
    "ResearchProject/Veriday/2class/models/doc2vec.model"

ml_model_path = \
    "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \
    "ResearchProject/Veriday/2class/models/ml.model.d2v.logreg"

veriday_articles_path = \
    "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \
    "ResearchProject/Veriday/annotated/all_articles.json"

veriday_predicted_articles_path = \
    "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/" + \
    "ResearchProject/Veriday/annotated/all_articles_predicted.json"

log = log_helper.get_logger("VeridayPredict2Class")


def load_models():
    doc2vec_model = Doc2Vec.load(doc2vec_model_path)
    ml_model = scikit_ml_helper.get_model_from_disk(ml_model_path)
    return doc2vec_model, ml_model


log.info("Begun execution")
doc2vec_model, ml_model = load_models()
log.info("Models loaded")

with open(veriday_articles_path) as veriday_articles_file:
    veriday_articles = json.load(veriday_articles_file)
Example #7
def main(args):

    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    print_config(cfg)

    output_path = FLAGS.output_path
    mask_path = FLAGS.mask_path
    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    if not os.path.isdir(mask_path):
        os.makedirs(mask_path)

    image_h = cfg.IMAGE_HEIGHT
    image_w = cfg.IMAGE_WIDTH

    logger = log_helper.get_logger()

    # We use our "load_graph" function
    logger.info("accessing tf graph")
    graph = load_graph(FLAGS.graph_name)

    if FLAGS.verbose:
        # We can verify that we can access the list of operations in the graph
        for op in graph.get_operations():
            logger.info(op.name)
            # prefix/Placeholder/inputs_placeholder
            # ...
            # prefix/Accuracy/predictions
        
    # We access the input and output nodes 
    input_img = graph.get_tensor_by_name('import/input/image:0')
    pred = graph.get_tensor_by_name('import/output/prob:0')

    # launch a Session
    with tf.Session(graph=graph) as sess:

        total_time_elapsed = 0.0

        for image, fname in instance_generator(FLAGS.sample_path):
            logger.info("predicting for {}".format(fname))

            begin_ts = time.time()
            feed_dict = {
                input_img: image[np.newaxis],
            }

            # Note: we didn't initialize/restore anything, everything is stored in the graph_def
            prediction = sess.run(pred, feed_dict=feed_dict)
            end_ts = time.time()
            logger.info("cost time: {} s".format(end_ts - begin_ts))
            total_time_elapsed += end_ts - begin_ts

            # output_image to verify
            output_fname = output_path + "/" + os.path.basename(fname)
            pred_img = np.reshape(prediction, (image_h, image_w, cfg.NUM_CLASSES))
            pred_prob = genPredProb(pred_img, cfg.NUM_CLASSES)
            ret = cv2.imwrite(output_fname, pred_prob)
            if not ret:
                logger.error('writing image to {} failed!'.format(output_fname))
                sys.exit(-1)

            # masking image
            mask_fname = mask_path + "/" + os.path.basename(fname)
            r, g, b = cv2.split(image.astype(np.uint8))
            cv_img = cv2.merge([b, g, r])
            masked = image_process.prob_mask(cv_img, pred_prob)
            ret = cv2.imwrite(mask_fname, masked)
            if not ret:
                logger.error('writing image to {} failed!'.format(mask_fname))
                sys.exit(-1)

        print("total time elapsed: {} s".format(total_time_elapsed))
Example #8
from sklearn import model_selection, linear_model, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

from processors.processor import Processor
from utils import file_helper
from utils import log_helper
from utils.ml_helper import train_xgboost_regressor
from utils.evaluation_helper import evaluate_task_score

log = log_helper.get_logger("TFIDFProcessor")


class TFIDFProcessor(Processor):
    def process(self):
        log.info("Began Processing")

        if self.options.validate:
            x_train_articles, y_train = file_helper.get_article_details(
                self.options.train_headlines_data_path)
            x_test_articles, y_test = file_helper.get_article_details(
                self.options.test_headlines_data_path)

            log.info("Extracting articles and scores")
            x_train_articles.extend(x_test_articles)
            y_train.extend(y_test)

            vectorizer = TfidfVectorizer(sublinear_tf=True,
import json

from gensim.models.doc2vec import TaggedDocument
from nltk import sent_tokenize, word_tokenize
from utils.options import Options

from utils import log_helper

log = log_helper.get_logger("ReviewFile_Helper")


def parse_review_file():
    """
    Parses the input review file
    :return: a list of TaggedDocs for Doc2Vec and a dict of scores
    """

    tagged_reviews = list()
    rating_dict = dict()
    for review in open(Options.options.input_file_path):
        identifier, tagged_review, rating = parse_review(json.loads(review))

        tagged_reviews.append(tagged_review)
        rating_dict[identifier] = rating

    return tagged_reviews, rating_dict


def parse_review(review):
    """
    :param review: JSON object containing an Amazon review
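
The body of parse_review is cut off here. As an illustration only, the sketch below shows what such a parser might return, assuming the common Amazon review JSON fields reviewerID, asin, reviewText, and overall; these field names are an assumption, not taken from the project.

from gensim.models.doc2vec import TaggedDocument
from nltk import word_tokenize


def parse_review_sketch(review):
    # Hypothetical field names for an Amazon review JSON object.
    identifier = '{}_{}'.format(review['reviewerID'], review['asin'])
    words = word_tokenize(review['reviewText'].lower())
    tagged_review = TaggedDocument(words=words, tags=[identifier])
    rating = review['overall']
    return identifier, tagged_review, rating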
from gensim.models.doc2vec import TaggedLineDocument

from processors.processor import Processor
from utils import doc2vec_helper
from utils import log_helper
from utils import scikit_ml_helper
from sklearn import metrics

log = log_helper.get_logger("FactCheckProcessor")


class FactCheckProcessorDocvec(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path, ml_model_file_path,
                 articles_source_file_path, shuffle_count, classification_sources_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = shuffle_count
        self.classification_sources_file_path = classification_sources_file_path
        self.samples_per_class_train = 680
        self.samples_per_class_test = 50

    def process(self):

        log.info("Commencing execution")

        tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)

        log.info("Training Doc2Vec model")
Example #11
from sklearn import metrics

from entities.fpb_tagged_line_document import FPBTaggedLineDocument
from processors.processor import Processor
from utils import doc2vec_helper
from utils import evaluation_helper
from utils import file_helper
from utils import log_helper
from utils import ml_helper

log = log_helper.get_logger("FPBDocvecProcessor")


class FPBDocvecProcessor(Processor):
    def process(self):
        log.info("Began Processing")

        fpb_training_docs = FPBTaggedLineDocument(
            self.options.fpb_sentences_file_path)

        doc2vec_model = \
            doc2vec_helper.init_model(
                fpb_training_docs, self.options.docvec_dimension_size, self.options.docvec_iteration_count
            )
        log.info("Doc2vec model initialized with " +
                 str(self.options.docvec_dimension_size) + " dimensions and " +
                 str(self.options.docvec_iteration_count) + " iterations")
        label_list = fpb_training_docs.get_label_list()

        log.info("Re-training document vectors")
        x_train = list()
from processors.processor import Processor
from utils import log_helper, file_helper, doc2vec_helper, scikit_ml_helper

log = log_helper.get_logger("AmazonProcessor")


class AmazonProcessor(Processor):
    def __init__(self, labeled_articles_source_file_path,
                 doc2vec_model_file_path, ml_model_file_path,
                 articles_source_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = 5

    def process(self):

        log.info("Commencing execution")

        # Get tagged articles from Veriday
        log.info("Getting tagged Veriday articles ... ")
        veriday_articles_raw = file_helper.get_articles_list(
            self.articles_source_file_path)
        veriday_tagged_articles = doc2vec_helper.get_tagged_articles_veriday(
            veriday_articles_raw)

        log.info("Getting tagged Amazon reviews ... ")
        tagged_articles, sentiment_scores_dict = \
            doc2vec_helper.get_tagged_amazon_reviews(self.labeled_articles_file_path)
import json

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from utils import file_helper
from utils import scikit_ml_helper

from processors.processor import Processor
from utils import log_helper

log = log_helper.get_logger("AmazonLineProcessorBigram")


class AmazonLineProcessorBigram(Processor):
    def __init__(self, labeled_articles_source_file_path,
                 doc2vec_model_file_path, ml_model_file_path,
                 articles_source_file_path, shuffle_count,
                 classification_sources_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = shuffle_count
        self.classification_sources_file_path = classification_sources_file_path

    def process(self):

        log.info("Commencing execution")

        with open(self.classification_sources_file_path) as source_cfg:
            sources_dict = json.load(source_cfg)
Example #14
def main(args):
    checkArgs()

    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    if FLAGS.stereo_path != '':
        cfg.DO_STEREO = True
    else:
        cfg.DO_STEREO = False

    base_path = None
    title_str = "{:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}".format(
        'ratio', 'abs_rel_i', 'sq_rel_i', 'rmse_i', 'rmse_log_i', 'd1_all_i',
        'a1_i', 'a2_i', 'a3_i', 'abs_rel', 'sq_rel', 'rmse', 'rmse_log',
        'd1_all', 'a1', 'a2', 'a3')
    if FLAGS.base_path != '':
        base_path = FLAGS.base_path
        title_str = "{:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}".format(
            'ratio', 'abs_rel', 'sq_rel', 'rmse', 'rmse_log', 'd1_all', 'a1',
            'a2', 'a3', 'abs_rel_b', 'sq_rel_b', 'rmse_b', 'rmse_log_b',
            'd1_all_b', 'a1_b', 'a2_b', 'a3_b')

    stereo_path = FLAGS.stereo_path if cfg.DO_STEREO else None

    cfg.BATCH_SIZE = 1
    if FLAGS.do_pp and not cfg.DO_STEREO:
        cfg.BATCH_SIZE = 2

    print_config(cfg)

    if FLAGS.output_path != '':
        output_path = FLAGS.output_path
        if not os.path.isdir(output_path):
            os.mkdir(output_path)

    logger = log_helper.get_logger()
    if FLAGS.model == 'res50':
        model = Res50DispNet(cfg, logger)
    else:
        logger.error('wrong model type: {}'.format(FLAGS.model))
        sys.exit(-1)

    # get moving avg
    if FLAGS.use_avg:
        variable_averages = tf.train.ExponentialMovingAverage(
            cfg.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
    else:
        saver = tf.train.Saver(model.all_variables)

    total_time_elapsed = 0
    with tf.Session() as sess:
        # restore model
        logger.info("restoring model ......")
        saver.restore(sess, FLAGS.ckpt_path)

        rate_list = []
        rmse_inter_list = []
        rmse_log_inter_list = []
        abs_rel_inter_list = []
        sq_rel_inter_list = []
        d1_all_inter_list = []
        a1_inter_list = []
        a2_inter_list = []
        a3_inter_list = []
        rmse_list = []
        rmse_log_list = []
        abs_rel_list = []
        sq_rel_list = []
        d1_all_list = []
        a1_list = []
        a2_list = []
        a3_list = []

        for image, label, fname in instance_label_generator(
                FLAGS.sample_path,
                FLAGS.label_path,
                cfg.IMAGE_WIDTH,
                cfg.IMAGE_HEIGHT,
                FLAGS.do_pp,
                stereo_path,
                base_path=base_path):
            if cfg.DO_STEREO:
                sample_name = fname[0]
                stereo_name = fname[1]
                logger.info("testing for {} & {}".format(fname[0], fname[1]))
                feed_dict = {
                    model.left_image: image[0],
                    model.right_image: image[1]
                }
                fname = sample_name
            else:
                logger.info("testing for {}".format(fname))
                if base_path is None:
                    feed_dict = {model.left_image: image}
                else:
                    feed_dict = {model.left_image: image[0]}

            begin_ts = time.time()

            pre_disp = sess.run(model.left_disparity[0], feed_dict=feed_dict)

            end_ts = time.time()
            logger.info("cost time: {} s".format(end_ts - begin_ts))
            total_time_elapsed += end_ts - begin_ts

            if FLAGS.do_pp and not cfg.DO_STEREO:
                disp = post_process_disparity(pre_disp.squeeze())
            else:
                disp = pre_disp[0].squeeze()

            base_disp = None if base_path is None else image[-1]

            width = label.shape[1]
            focal = KITTI_FOCAL[width]
            base = KITTI_BASE
            (rate, d1_all_inter, abs_rel_inter, sq_rel_inter, rmse_inter,
             rmse_log_inter, a1_inter, a2_inter, a3_inter, d1_all, abs_rel,
             sq_rel, rmse, rmse_log, a1, a2, a3) = depth_metrics(
                 label, disp, focal, base, base_disp)

            print(title_str)
            print(
                "{:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}"
                .format(rate, abs_rel_inter, sq_rel_inter, rmse_inter,
                        rmse_log_inter, d1_all_inter, a1_inter, a2_inter,
                        a3_inter, abs_rel, sq_rel, rmse, rmse_log, d1_all, a1,
                        a2, a3))

            rate_list.append(rate)
            rmse_inter_list.append(rmse_inter)
            rmse_log_inter_list.append(rmse_log_inter)
            abs_rel_inter_list.append(abs_rel_inter)
            sq_rel_inter_list.append(sq_rel_inter)
            d1_all_inter_list.append(d1_all_inter)
            a1_inter_list.append(a1_inter)
            a2_inter_list.append(a2_inter)
            a3_inter_list.append(a3_inter)
            rmse_list.append(rmse)
            rmse_log_list.append(rmse_log)
            abs_rel_list.append(abs_rel)
            sq_rel_list.append(sq_rel)
            d1_all_list.append(d1_all)
            a1_list.append(a1)
            a2_list.append(a2)
            a3_list.append(a3)

            # output_image to verify
            if FLAGS.output_path != '':
                if FLAGS.do_pp and not cfg.DO_STEREO:
                    output_fname = output_path + "/pp_" + os.path.basename(
                        fname)
                else:
                    output_fname = output_path + "/" + os.path.basename(fname)
                plt.imsave(output_fname, disp, cmap=plt.cm.gray)

        rate_mean = np.array(rate_list).mean()
        rmse_inter_mean = np.array(rmse_inter_list).mean()
        rmse_log_inter_mean = np.array(rmse_log_inter_list).mean()
        abs_rel_inter_mean = np.array(abs_rel_inter_list).mean()
        sq_rel_inter_mean = np.array(sq_rel_inter_list).mean()
        d1_all_inter_mean = np.array(d1_all_inter_list).mean()
        a1_inter_mean = np.array(a1_inter_list).mean()
        a2_inter_mean = np.array(a2_inter_list).mean()
        a3_inter_mean = np.array(a3_inter_list).mean()
        rmse_mean = np.array(rmse_list).mean()
        rmse_log_mean = np.array(rmse_log_list).mean()
        abs_rel_mean = np.array(abs_rel_list).mean()
        sq_rel_mean = np.array(sq_rel_list).mean()
        d1_all_mean = np.array(d1_all_list).mean()
        a1_mean = np.array(a1_list).mean()
        a2_mean = np.array(a2_list).mean()
        a3_mean = np.array(a3_list).mean()

        print("============total metric============")
        print(title_str)
        print(
            "{:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}, {:10.3f}"
            .format(rate_mean, abs_rel_inter_mean, sq_rel_inter_mean,
                    rmse_inter_mean, rmse_log_inter_mean, d1_all_inter_mean,
                    a1_inter_mean, a2_inter_mean, a3_inter_mean, abs_rel_mean,
                    sq_rel_mean, rmse_mean, rmse_log_mean, d1_all_mean,
                    a1_mean, a2_mean, a3_mean))

        print("total time elapsed: {} s".format(total_time_elapsed))
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from utils import log_helper

log = log_helper.get_logger("EvaluationHelper")


def evaluate_task_score(y_true, y_pred):

    cosine_smty = \
        cosine_similarity(np.array(y_pred).reshape(1, -1),
                          np.array(y_true).reshape(1, -1))[0][0]

    log.info("Cosine Similarity: " + str(cosine_smty))

    return cosine_smty


Example #16
from sklearn import linear_model, svm
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

from utils import log_helper

log = log_helper.get_logger("ML_Helper")


def train_linear_model(x, y):

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    linear_reg_model = linear_model.LinearRegression()
    linear_reg_model.fit(x_train, y_train)

    log.info("Linear Regression accuracy: " + str(linear_reg_model.score(x_test, y_test)))

    return linear_reg_model


def train_svm(x, y):

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

    svm_regressor = svm.LinearSVR()
    svm_regressor.fit(x_train, y_train)

    log.info("SVR accuracy: " + str(svm_regressor.score(x_test, y_test)))

    return svm_regressor
Example #17
from sklearn import model_selection, linear_model, svm
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor

from processors.processor import Processor
from utils import file_helper
from utils import log_helper
from utils.evaluation_helper import evaluate_task_score
from utils.ml_helper import train_xgboost_regressor

log = log_helper.get_logger("BigramProcessor")
min_ngram_range = range(1, 3)
max_ngram_range = range(1, 3)


class BigramProcessor(Processor):
    def process(self):
        log.info("Began Processing")

        if self.options.validate:
            x_train_articles, y_train = file_helper.get_article_details(
                self.options.train_headlines_data_path)
            x_test_articles, y_test = file_helper.get_article_details(
                self.options.test_headlines_data_path)

            log.info("Extracting articles and scores")
            x_train_articles.extend(x_test_articles)
            y_train.extend(y_test)
Example #18
from gensim.models.doc2vec import Doc2Vec
from utils.options import Options

from utils import log_helper

log = log_helper.get_logger("Doc2Vec_Helper")


def init_doc2vec_model(tagged_reviews):

    model = Doc2Vec(min_count=25, iter=50, workers=6, size=1000)
    model.build_vocab(tagged_reviews)

    return model


def train_doc2vec_model(doc2vec_model, tagged_reviews):

    shuffle_count = Options.doc2vec_training_count

    for i in range(shuffle_count):
        log.info("Shuffles left: " + str(shuffle_count - i))
        doc2vec_model.train(tagged_reviews)
Example #19
import sys

from args.options import Options
from argparse import ArgumentParser
from processors.term_scrape_processor import TermScrapeProcessor
from processors.content_scrape_processor import ContentScrapeProcessor
from exceptions.cmdline_exception import CmdLineException
from utils import log_helper

log = log_helper.get_logger("run")


def parse_args(argv):

    parser = ArgumentParser(prog="Investopedia Term Scraper")
    parser.add_argument('--mode', metavar='Term Scrape / Content Scrape', type=str)
    parser.add_argument('--term_indices_file_path', metavar='Term Indices for Investopedia', type=str)
    parser.add_argument('--term_list_file_path', metavar='Term List filepath', type=str)
    parser.add_argument('--output_file_path', metavar='Output File Path', type=str)

    Options.args = parser.parse_args(argv, namespace=Options)


def validate_args(args):

    if args.mode == 'term-scrape':

        if not args.term_indices_file_path:
            msg = "'term-scrape' mode requires 'term_indices_file_path'"
            log.error(msg)
            raise CmdLineException(msg)
Example #20
import json
from time import time

from utils import doc2vec_helper, ml_helper
from utils.options import Options

from entities.rated_review_document import RatedReviewDocument
from processors.processor import Processor
from utils import log_helper

log = log_helper.get_logger("AmazonReviewProcessor")


class AmazonReviewProcessor(Processor):
    def process(self):

        log.info("Processing begun")

        log.info("Reading input file " + Options.options.input_file_path)
        review_iterator = RatedReviewDocument(Options.options.input_file_path)

        log.info("Building Doc2Vec model")
        start_time = time()
        doc2vec_model = doc2vec_helper.init_doc2vec_model(review_iterator)
        doc2vec_helper.train_doc2vec_model(doc2vec_model, review_iterator)
        time_to_create_docvecs = time() - start_time
        log.info("Doc2Vec model successfully trained")

        ratings_list = list()
        with open(Options.options.input_file_path) as reviews_file:
            for line in reviews_file:
Example #21
def main(args):

    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)

    do_pp = FLAGS.do_pp
    if FLAGS.do_stereo:
        do_pp = False
        cfg.DO_STEREO = True
    else:
        cfg.DO_STEREO = False

    cfg.BATCH_SIZE = 1
    if do_pp:
        cfg.BATCH_SIZE = 2

    print_config(cfg)

    output_path = FLAGS.output_path
    if output_path != '':
        if not os.path.isdir(output_path):
            os.makedirs(output_path)

    logger = log_helper.get_logger()
    do_recon = FLAGS.recon_path != ''
    if do_recon:
        if FLAGS.stereo_path == '':
            logger.error("to do reconstruction, stereo_path has to be set!")
            sys.exit(-1)
        recon_path = FLAGS.recon_path
        if not os.path.isdir(recon_path):
            os.makedirs(recon_path)
    stereo_path = FLAGS.stereo_path

    if FLAGS.model == 'res50':
        model = Res50DispNet(cfg, logger)
    else:
        logger.error('wrong model type: {}'.format(FLAGS.model))
        sys.exit(-1)

    if FLAGS.use_avg:
        # get moving avg
        variable_averages = tf.train.ExponentialMovingAverage(cfg.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
    else:
        saver = tf.train.Saver(model.all_variables)

    with tf.Session() as sess:
        # restore model
        logger.info("restoring model ......")
        saver.restore(sess, FLAGS.ckpt_path)
        total_time_elapsed = 0.0

        aspect_ratio = float(cfg.IMAGE_WIDTH) / cfg.IMAGE_HEIGHT
        for image, fname in instance_generator(FLAGS.sample_path, cfg.IMAGE_WIDTH, cfg.IMAGE_HEIGHT,
                                               do_pp, stereo_path, cfg.DO_STEREO, do_recon):
            if cfg.DO_STEREO or do_recon:
                sample_name = fname[0]
                stereo_name = fname[1]
                logger.info("inference for {} & {}".format(fname[0], fname[1]))
                feed_dict = {
                    model.left_image: image[0],
                    model.right_image: image[1]
                }
                fname = sample_name
            else:
                logger.info("inference for {}".format(fname))
                feed_dict = {
                    model.left_image: image
                }

            begin_ts = time.time()

            if not do_recon:
                pre_disp = sess.run(model.left_disparity[0], feed_dict=feed_dict)
            else:
                pre_disp, recon, recon_diff = sess.run([model.left_disparity[0],
                                                        model.left_reconstruction[0],
                                                        model.left_recon_diff[0]],
                                                        feed_dict=feed_dict)
                recon = recon[0,:,:,:]
                recon_diff = recon_diff[0,:,:,:]

                #print pre_disp.shape
                #print recon.shape
                #print recon_diff.shape

            end_ts = time.time()
            logger.info("cost time: {} s".format(end_ts - begin_ts))
            total_time_elapsed += end_ts - begin_ts

            if do_pp:
                disp = post_process_disparity(pre_disp.squeeze())
            else:
                disp = pre_disp[0].squeeze()

            if FLAGS.resize_ratio != 0 and FLAGS.resize_ratio != 1:
                disp = cv2.resize(disp, (FLAGS.resize_ratio*cfg.IMAGE_WIDTH, FLAGS.resize_ratio*cfg.IMAGE_HEIGHT),
                                  interpolation=cv2.INTER_LINEAR)


            # output disparity
            if output_path != '':
                if do_pp:
                    output_fname = output_path + "/pp_" + os.path.basename(fname)
                else:
                    output_fname = output_path + "/" + os.path.basename(fname)

                plt.imsave(output_fname, disp, cmap=plt.cm.gray)

            if do_recon:
                o_image = cv2.resize(image[0][0],
                                     (FLAGS.resize_ratio*cfg.IMAGE_WIDTH, FLAGS.resize_ratio*cfg.IMAGE_HEIGHT),
                                     interpolation=cv2.INTER_LINEAR)
                o_recon = cv2.resize(recon,
                                     (FLAGS.resize_ratio*cfg.IMAGE_WIDTH, FLAGS.resize_ratio*cfg.IMAGE_HEIGHT),
                                     interpolation=cv2.INTER_LINEAR)
                o_diff = cv2.resize(recon_diff,
                                    (FLAGS.resize_ratio*cfg.IMAGE_WIDTH, FLAGS.resize_ratio*cfg.IMAGE_HEIGHT),
                                    interpolation=cv2.INTER_LINEAR)

                whole_fig = plt.figure(figsize=(int(aspect_ratio*8), 8))
                gs = gridspec.GridSpec(2, 2)
                a = plt.subplot(gs[0, 0])
                b = plt.subplot(gs[1, 0])
                c = plt.subplot(gs[0, 1])
                d = plt.subplot(gs[1, 1])

                a.imshow(o_image)
                a.set_title('raw_image')
                a.get_xaxis().set_visible(False)
                a.get_yaxis().set_visible(False)

                b.imshow(disp, cmap=plt.cm.gray)
                b.set_title('disparity')
                b.get_xaxis().set_visible(False)
                b.get_yaxis().set_visible(False)

                c.imshow(o_recon)
                c.set_title('reconstruct')
                c.get_xaxis().set_visible(False)
                c.get_yaxis().set_visible(False)

                d.imshow(o_diff)
                d.set_title('recon_diff')
                #plt.tight_layout()
                d.get_xaxis().set_visible(False)
                d.get_yaxis().set_visible(False)

                output_fname = recon_path + "/" + os.path.basename(fname)
                plt.savefig(output_fname)

                # for release memory
                plt.clf()
                plt.close()

        print("total time elapsed: {} s".format(total_time_elapsed))
 def _get_spider_logger(self):
     """获取爬虫日志对象"""
     return get_logger("spider",
                       to_file=True,
                       to_console=True,
                       filename=self.spider_log_name)
from newspaper import Article

from utils import log_helper

log = log_helper.get_logger(__name__)


def get_article_content(url_list):
    article_tuples = list()

    for url in url_list:
        try:
            article = Article(url)
            article.download()
            article.parse()
            article_tuple = (article.title, article.text)
            article_tuples.append(article_tuple)
        except Exception as e:
            log.error(e)

    return article_tuples


def get_tweet_content(status_list):
    tweets = list()

    for status in status_list:
        tweets.append(status.text)

    return tweets
Example #24
from sklearn import model_selection, linear_model, svm
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor

from entities.semeval_tagged_line_document import SemevalTaggedLineDocument
from processors.processor import Processor
from utils import doc2vec_helper
from utils import file_helper
from utils import log_helper
from utils.evaluation_helper import evaluate_task_score

log = log_helper.get_logger("DocvecProcessorCrossval")


class DocvecProcessorCrossval(Processor):
    def process(self):
        log.info("Began Processing")

        semeval_train_docs = SemevalTaggedLineDocument(
            self.options.train_headlines_data_path)

        doc2vec_model = \
            doc2vec_helper.init_model(
                semeval_train_docs, self.options.docvec_dimension_size, self.options.docvec_iteration_count
            )
        log.info("Doc2vec model initialized with " +
                 str(self.options.docvec_dimension_size) + " dimensions and " +
                 str(self.options.docvec_iteration_count) + " iterations")

        x_articles, y_train = file_helper.get_article_details(
            self.options.train_headlines_data_path)
Example #25
# -*- coding: utf-8 -*-
"""
Issue an HTTP GET request and receive the returned result.
@file: get_helper.py
@time: 2018/10/25 19:20
Created by Junyi.
"""
from requests_html import HTMLSession
from utils.log_helper import get_logger
from utils.decorator import deal_exceptions

requests_logger = get_logger(logger_name='requests_logger', to_console=False,
                             to_file=True, filename='requests')


def is_useful_response(func):
    """
    Decorator that checks whether the response is a text/html page.
    :param func: the function to be decorated
    :return: response | None
    """
    def swapper(*args, **kwargs):
        response = func(*args, **kwargs)
        if response.status_code == 200:
            content_type = response.headers['Content-Type']
            if 'text/html' in content_type:
                requests_logger.info(f"Request {response.url} successful!")
            else:
                requests_logger.warning(f"{response.url} is not a text html page!")
                response = None
            return response
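
The snippet is cut off before is_useful_response is applied to anything. Below is a sketch of a GET helper it could wrap, using the HTMLSession import shown above; request_url_sketch is an illustrative name, and the project's actual request_url is not shown here.

from requests_html import HTMLSession


def request_url_sketch(url, timeout=10):
    # Plain GET; wrapped with @is_useful_response it would yield the
    # response only for 200 text/html pages and None otherwise.
    session = HTMLSession()
    return session.get(url, timeout=timeout)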
Example #26
from processors.processor import Processor
from utils import log_helper, file_helper, doc2vec_helper, scikit_ml_helper

log = log_helper.get_logger("ModelTrainer")


class ModelTrainer(Processor):

    def __init__(self, labeled_articles_source_file_path, doc2vec_model_file_path, ml_model_file_path,
                 articles_source_file_path, output_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.output_file_path = output_file_path
        self.shuffle_count = 100

    def process(self):

        log.info("Commencing execution")

        # Get tagged articles from Veriday
        log.info("Getting tagged Veriday articles ... ")
        veriday_articles_raw = file_helper.get_articles_list(self.articles_source_file_path)
        veriday_tagged_articles = doc2vec_helper.get_tagged_articles_veriday(veriday_articles_raw)

        # Convert articles file into a Tagged documents for doc2vec
        log.info("Getting tagged Semeval articles ... ")
        articles = file_helper.get_articles_list(self.labeled_articles_file_path)
        tagged_articles, sentiment_scores_dict = doc2vec_helper.get_tagged_articles_scores(articles)
Example #27
from processors.processor import Processor
from utils import log_helper, file_helper, doc2vec_helper, scikit_ml_helper

log = log_helper.get_logger("ArticleClassifier")


class ArticleClassifier(Processor):
    def __init__(self, labeled_articles_source_file_path,
                 doc2vec_model_file_path, ml_model_file_path,
                 articles_source_file_path):
        self.labeled_articles_file_path = labeled_articles_source_file_path
        self.articles_source_file_path = articles_source_file_path
        self.doc2vec_model_file_path = doc2vec_model_file_path
        self.ml_model_file_path = ml_model_file_path
        self.shuffle_count = 1

    def process(self):

        log.info("Commencing execution")

        # Get tagged articles from Semeval
        log.info("Getting Semeval articles ... ")
        semeval_articles_raw = file_helper.get_articles_list(
            self.labeled_articles_file_path)
        semeval_tagged_articles, document_sentiment_classes = \
            doc2vec_helper.get_tagged_semeval_articles(semeval_articles_raw)

        # model initialization and vocab building
        log.info("Initializing the doc2vec model ...")
        doc2vec_model = doc2vec_helper.init_model(semeval_tagged_articles)
Example #28
def main(args):
    if FLAGS.cfg_file:
        print('loading config setting')
        cfg_from_file(FLAGS.cfg_file, cfg)
    cfg.BATCH_SIZE = 1
    print_config(cfg)

    output_path = FLAGS.output_path
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    batch_size = 1
    image_h = cfg.IMAGE_HEIGHT
    image_w = cfg.IMAGE_WIDTH
    image_c = cfg.IMAGE_DEPTH
    output_name = FLAGS.output_name

    whole_graph_ext = 'pb' if FLAGS.whole_graph_bin else 'pbtxt'
    infer_graph_ext = 'pb' if FLAGS.infer_graph_bin else 'pbtxt'
    whole_graph_name = "{}_whole.{}".format(output_name, whole_graph_ext)
    infer_graph_name = "{}_infer.{}".format(output_name, whole_graph_ext)
    uff_graph_name = "{}_uff.{}".format(output_name, whole_graph_ext)
    output_graph_path = "{}/{}.{}".format(output_path, output_name, infer_graph_ext)
    output_uff_graph_path = "{}/{}_uff.{}".format(output_path, output_name, infer_graph_ext)
    print(whole_graph_name)
    print(infer_graph_name)
    print(uff_graph_name)
    print(output_graph_path)
    print(output_uff_graph_path)

    # We clear devices to allow TensorFlow to control on which device it will load operations
    clear_devices = True

    # Build graph
    logger = log_helper.get_logger()
    if FLAGS.model == 'sq':
        model = SQSegNet(cfg, logger)
    elif FLAGS.model == 'erf':
        model = ERFSegNet(cfg, logger)

    output_node_names = "output/prob"


    if FLAGS.restore_avg:
        # get moving avg
        variable_averages = tf.train.ExponentialMovingAverage(cfg.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
    else:
        saver = tf.train.Saver(model.all_variables)

    with tf.Session() as sess:
        # Load checkpoint
        whole_graph_def = sess.graph.as_graph_def()

        # fix whole_graph_def for bn
        for node in whole_graph_def.node:
            if node.op == 'RefSwitch':
                node.op = 'Switch'
                for index in xrange(len(node.input)):
                    if 'moving_' in node.input[index]:
                        node.input[index] = node.input[index] + '/read'
            elif node.op == 'AssignSub':
                node.op = 'Sub'
                if 'use_locking' in node.attr: del node.attr['use_locking']
            elif node.op == 'AssignAdd':
                node.op = 'Add'
                if 'use_locking' in node.attr: del node.attr['use_locking']

        print("%d ops in the whole graph." % len(whole_graph_def.node))

        tf.train.write_graph(whole_graph_def, output_path,
                             whole_graph_name, as_text=not FLAGS.whole_graph_bin)

        infer_graph_def = graph_util.extract_sub_graph(whole_graph_def, output_node_names.split(","))
        print("%d ops in the infer graph." % len(infer_graph_def.node))

        tf.train.write_graph(infer_graph_def, output_path,
                             infer_graph_name, as_text=not FLAGS.whole_graph_bin)


        # fix infer_graph_def for bn for conversion to TensorRT uff
        for node in infer_graph_def.node:
            name_fields = node.name.split('/')
            if name_fields[-2] == 'batchnorm':
                if name_fields[-1] == 'add':
                    for index in xrange(len(node.input)):
                        if 'cond/Merge' in node.input[index]:
                            node.input[index] = '/'.join(name_fields[:-2] + ['moving_variance', 'read'])
                if name_fields[-1] == 'mul_2':
                    for index in xrange(len(node.input)):
                        if 'cond/Merge' in node.input[index]:
                            node.input[index] = '/'.join(name_fields[:-2] + ['moving_mean', 'read'])

        uff_graph_def = graph_util.extract_sub_graph(infer_graph_def, output_node_names.split(","))
        print("%d ops in the uff graph." % len(uff_graph_def.node))

        tf.train.write_graph(uff_graph_def, output_path,
                             uff_graph_name, as_text=not FLAGS.whole_graph_bin)

        saver.restore(sess, FLAGS.ckpt_path)

        output_graph_def = graph_util.convert_variables_to_constants(
            sess, # The session is used to retrieve the weights
            whole_graph_def, # The graph_def is used to retrieve the nodes 
            output_node_names.split(",") # The output node names are used to select the usefull nodes
        ) 

        output_uff_graph_def = graph_util.convert_variables_to_constants(
            sess, # The session is used to retrieve the weights
            infer_graph_def, # The graph_def is used to retrieve the nodes 
            output_node_names.split(",") # The output node names are used to select the usefull nodes
        ) 

        # Finally we serialize and dump the output graph to the filesystem
        mode = "wb" if FLAGS.infer_graph_bin else "w"
        with tf.gfile.GFile(output_graph_path, mode) as f:
            if FLAGS.infer_graph_bin:
                f.write(output_graph_def.SerializeToString())
            else:
                f.write(str(output_graph_def))

        print("%d ops in the output graph." % len(output_graph_def.node))

        with tf.gfile.GFile(output_uff_graph_path, mode) as f:
            if FLAGS.infer_graph_bin:
                f.write(output_uff_graph_def.SerializeToString())
            else:
                f.write(str(output_uff_graph_def))

        print("%d ops in the output uff graph." % len(output_uff_graph_def.node))
import json

from args.options import Options
from processors.processor import Processor
from utils import log_helper, scrape_helper

log = log_helper.get_logger("TermScrapeProcessor")


class TermScrapeProcessor(Processor):
    def __init__(self):
        super().__init__()
        self.domain = "http://www.investopedia.com"
        self.root_url = self.domain + "/terms/"
        self.min_term_count = 100

    def process(self):

        log.info("Processing begun")

        with open(Options.args.term_indices_file_path) as indices_file:
            list_of_indices = json.load(indices_file)

        log.info("There are " + str(len(list_of_indices)) + " indices")

        output_file_object = open(Options.args.output_file_path, 'w')

        for index_term in list_of_indices:

            term_set = set()
            log.info("Working on index term " + index_term)
import json

from utils import log_helper

log = log_helper.get_logger("FileHelper")


def get_articles_list(articles_file_path):
    with open(articles_file_path, 'r') as articles_file:
        articles_data = articles_file.read()

    return json.loads(articles_data)


def get_article_details(articles_file_path):

    articles = list()
    sentiment_scores = list()

    semeval_articles = get_articles_list(articles_file_path)

    for semeval_article in semeval_articles:
        if "sentiment" in semeval_article.keys():
            sentiment_scores.append(semeval_article['sentiment'])
        articles.append(semeval_article['title'].replace(semeval_article['company'], "Umbrella Corp"))

    return articles, sentiment_scores


def annotate_test_set(test_headlines_data_path, y_test):