Example #1
def init_train_data(fnames, topics):
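  """Build the ranking training data: for each topic, load the labelled
  answers with filter_data, extend them with extra candidate ids from
  ZC.get_raw_rank, and collect (preference, feature_vector, QID) tuples.
  The resulting list is also dumped to train_rank.dat via pprint."""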
  print ('[ init_train_data ] =================')
  # amap:
  #   key   : aid
  #   value : (attr[0] preference, attr[1] aid, attr[2] aname)

  train_rank = []
  for QID in range(len(topics)):
    fname = fnames[QID]
    topic = topics[QID]

    amap = filter_data(fname)
    fea = Feature(topic)

    ext_aids = ZC.get_raw_rank(topic, EXT_TRAIN_A_SIZE)
    print('[ init_train_data ] amap_1 size = %d' % len(amap))

    for tid in ext_aids:
      if tid not in amap:
        amap[tid] = (0, tid, '')

    print('[ init_train_data ] amap_2 size = %d' % len(amap))
    
    for tid in amap : 
      fv = fea.get_feature_vector(tid)
      #print ('[ init_train_data ] %d get feature vector ok.' %(tid))
      train_rank.append( (int(amap[tid][0]), reform_vector(fv), QID) )

    print('[ init_train_data ] topic : %s ok , train_rank_size = %d' % (topic, len(train_rank)))
    ZC.dump_cache()

  with open('train_rank.dat' , 'w') as f :
    pprint.pprint(train_rank, f)

  return train_rank
Example #2
def process(data: Data) -> Data:
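    """Apply the processing steps selected via data.args, in order: grayscale
    conversion (make_white_black with the standard luminance weights
    0.299/0.587/0.114), filtering, normalization (norm) and a Sobel filter."""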
    result = data

    if result.args.white_black:
        result = make_white_black(result, np.array([0.299, 0.587, 0.114, 1]))

    if result.args.filter:
        result = filter_data(result)

    if result.args.bit:
        result.norm()

    if result.args.sobel:
        result = sobel_filter(result)  # chain on the accumulated result, not the original data

    return result
Example #3
def init_test_data(fname, topic):
    print('[ init_test_data ] =================')
    QID = 1
    # amap , key : aid
    # value : attr[0] preference, attr[1] aid , attr[2] aname
    amap = filter_data(fname)
    fea = Feature(topic)
    train_rank = []
    for tid in amap:
        aid = int(tid)
        fv = fea.get_feature_vector(aid)
    print('[ init_test_data ] %d get feature vector ok.' % aid)
        train_rank.append((aid, reform_vector(fv), QID))
        #ZC.dump_cache()

    return train_rank
Example #4
def init_test_data(fname, topic):
  print('[ init_test_data ] =================')
  QID = 1
  # amap , key : aid 
  # value : attr[0] preference, attr[1] aid , attr[2] aname
  amap = filter_data(fname)
  fea = Feature(topic)
  train_rank = []
  for tid in amap : 
    aid = int(tid)
    fv = fea.get_feature_vector(aid)
    print('[ init_test_data ] %d get feature vector ok.' % aid)
    train_rank.append( (aid, reform_vector(fv), QID) )
    #ZC.dump_cache()


  return train_rank
Example #5
def preprocess_data(path_model, model_name, k, path_preprocess):
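    """Preprocess the data for one model: load, filter, add_tia and normalize,
    then write the preprocessed data, the selected features (select_feature,
    with rank-sum statistics) and the k-fold assignment (kfold_data) to CSV
    files under path_preprocess."""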
    path = path_model + model_name

    data = load_file(path)
    data = filter_data(data)
    data = add_tia(data)
    data = normalize_data(data)
    data.to_csv(path_preprocess + 'preprocessed_data_' + model_name, index=0)

    [feature_selected, stat_ranksums] = select_feature(data)
    feature_selected.to_csv(path_preprocess + 'selected_features_' +
                            model_name,
                            index=0)

    sn_folds = kfold_data(data, k)
    sn_folds.to_csv(path_preprocess + 'sn_folds_' + model_name, index=0)

    print('[%s]%s done...' % (time.asctime(time.localtime(
        time.time())), sys._getframe().f_code.co_name))
    return [feature_selected, sn_folds]
Example #6
def init_train_data(fnames, topics):
    print('[ init_train_data ] =================')
    # amap:
    #   key   : aid
    #   value : (attr[0] preference, attr[1] aid, attr[2] aname)

    train_rank = []
    for QID in range(len(topics)):
        fname = fnames[QID]
        topic = topics[QID]

        amap = filter_data(fname)
        fea = Feature(topic)

        ext_aids = ZC.get_raw_rank(topic, EXT_TRAIN_A_SIZE)
        print('[ init_train_data ] amap_1 size = %d' % len(amap))

        for tid in ext_aids:
            if tid not in amap:
                amap[tid] = (0, tid, '')

        print('[ init_train_data ] amap_2 size = %d' % len(amap))

        for tid in amap:
            fv = fea.get_feature_vector(tid)
            #print ('[ init_train_data ] %d get feature vector ok.' %(tid))
            train_rank.append((int(amap[tid][0]), reform_vector(fv), QID))

        print('[ init_train_data ] topic : %s ok , train_rank_size = %d' %
              (topic, len(train_rank)))
        ZC.dump_cache()

    with open('train_rank.dat', 'w') as f:
        pprint.pprint(train_rank, f)

    return train_rank
Example #7
        'failure',
        'smart_1_normalized',
        'smart_3_normalized',
        'smart_5_normalized',
        'smart_7_normalized',
        'smart_9_normalized',
        'smart_187_normalized',
        'smart_189_normalized',
        'smart_194_normalized',
        'smart_195_normalized',
        'smart_197_normalized',
        'smart_5_raw',
        'smart_197_raw',
    ]

    # Preprocess data
    data['failure'] = (data['failure'] * (-1) + 1) / 2
    data = add_date(data)
    data = filter.filter_data(data, save=1, start_date='2010-01-01')
    data = filter.add_tia(data)
    data = filter.normalize_data(data)
    data.to_csv(path_load + 'preprocessed_data_' + model_name, index=0)

    # Select features
    [feature_selected, stat_ranksums] = filter.select_feature(data)
    feature_selected.to_csv(path_load + 'selected_features_' + model_name,
                            index=0)

    # Build k-fold tags
    sn_folds = filter.kfold_data(data, k)
    sn_folds.to_csv(path_load + 'sn_folds_' + model_name, index=0)
Example #8
def pipeline(directory: str,
             frames: List[Frame],
             parser_name: str,
             extract_features: set,
             filter_: dict,
             prune_test_data=True,
             log_data=False):
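    """End-to-end run for one parser: filter the frames, build feature
    representations, split the sentences into train/test sets, train the
    identification and labeling classifiers, evaluate them, and optionally
    (log_data) save the models and evaluation reports under `directory`."""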
    filter = copy.deepcopy(filter_)
    # Prune sentences without an LU from the frames
    # TODO: log no. sentences pruned
    filter_faulty_sentences(frames)

    # Filter frames
    (frames, no_filtered_frames_sentences) = filter_data(frames, filter)
    print(f"Filtered data")
    # Add feature representation of each word to each word node
    (no_data_points_features) = create_feature_representation(
        frames, extract_features)
    print(f"Feature representation created")

    no_frames = f"Number of frames: {len(frames)}"
    print(no_frames)

    # Split data into training and test sets
    (train_sentences, test_sentences) = split_data_train_test(frames)

    # # For testing purpose adding all sentences to training and testing
    # sentences = []
    # for frame in frames:
    #     sentences.extend(frame.getSentences())
    # (train_sentences, test_sentences) = (sentences, sentences)

    no_sentences = f"Number of sentences: {len(train_sentences) + len(test_sentences)}"
    print(no_sentences)
    print(no_data_points_features)

    filter = copy.deepcopy(filter_)
    # Train models
    id_clf, label_clf, report_training = train_models_2(
        train_sentences, filter)

    print(f"{report_training}")
    if log_data:
        model_path = f"{directory}/models"
        if not os.path.isdir(model_path):
            # Create model folder
            try:
                os.mkdir(model_path)
            except OSError as exc:
                raise OSError(f"Unable to create directory {model_path}") from exc
        # Save models
        save_model(id_clf, f"{parser_name}_identification_model",
                   f"{model_path}")
        save_model(label_clf, f"{parser_name}_labeling_model", f"{model_path}")

    filter = copy.deepcopy(filter_)
    # Test models
    (id_evaluation, label_evaluation,
     evaluation) = test_models_2(id_clf,
                                 label_clf,
                                 test_sentences,
                                 filter,
                                 prune_test_data=prune_test_data)
    print(f"Models tested")
    if log_data:
        # Save evaluation
        save_to_file(id_evaluation,
                     f"{directory}/{parser_name}_id_evaluation.txt")
        save_to_file(label_evaluation,
                     f"{directory}/{parser_name}_label_evaluation.txt")
        save_to_file(f"{evaluation}",
                     f"{directory}/{parser_name}_evaluation.txt")
        save_to_file(
            f"{no_frames}\n{no_sentences}\n{no_data_points_features}\n{report_training}",
            f"{directory}/run_description.txt")

    # Release memory using garbage collection
    del id_clf
    del label_clf
    gc.collect()
    return evaluation
Example #9
def pipeline_2(directory: str,
               frames: List[Frame],
               parser_name: str,
               extract_features: set,
               filter_: dict,
               prune_test_data=True,
               log_data=False):
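    """Same pipeline as above, except that instead of training new classifiers
    it loads previously saved identification and labeling models from
    `{directory}/models` and only evaluates them on the test split."""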

    filter = copy.deepcopy(filter_)

    # Prune sentences without an LU from the frames
    filter_faulty_sentences(frames)

    # Filter frames
    (frames, no_filtered_frames_sentences) = filter_data(frames, filter)
    print(f"Filtered data")

    # Add feature representation of each word to each word node
    (no_data_points_features) = create_feature_representation(
        frames, extract_features)
    print(f"Feature representation created")

    no_frames = f"Number of frames: {len(frames)}"
    print(no_frames)

    # Split data into training and test sets
    (train_sentences, test_sentences) = split_data_train_test(frames)

    no_sentences = f"Number of sentences: {len(train_sentences) + len(test_sentences)}"
    print(no_sentences)
    print(no_data_points_features)

    id_clf = open_model(f"{parser_name}_identification_model",
                        f"{directory}/models")
    label_clf = open_model(f"{parser_name}_labeling_model",
                           f"{directory}/models")

    filter = copy.deepcopy(filter_)

    # Test models
    (id_evaluation, label_evaluation,
     evaluation) = test_models_2(id_clf,
                                 label_clf,
                                 test_sentences,
                                 filter,
                                 prune_test_data=prune_test_data)
    print(f"Models tested")
    if log_data:
        # Save evaluation
        save_to_file(id_evaluation,
                     f"{directory}/{parser_name}_id_evaluation.txt")
        save_to_file(label_evaluation,
                     f"{directory}/{parser_name}_label_evaluation.txt")
        save_to_file(f"{evaluation}",
                     f"{directory}/{parser_name}_evaluation.txt")
        save_to_file(
            f"\n{no_frames}\n{no_sentences}\n{no_data_points_features}",
            f"{directory}/run_description.txt")

    # Release memory using garbage collection
    del id_clf
    del label_clf
    gc.collect()
    return evaluation
Example #10
def main(run_args):
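    """Overall flow: unless run_args.skip is set, crawl Facebook pages (or
    load them from run_args.file), then for each page filter, normalize and
    count reactions into <id>_filtered_normalized_count.json; finally
    aggregate the dominant reaction(s) per category and plot them with
    matplotlib."""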

    if not run_args.skip:
        access_token = run_args.access_token

        if not run_args.file:
            print("\n# 0. Crawling Facebook...")
            if run_args.specific:
                specific = json.load(run_args.specific)
                print("Specific category to get :")
                print(specific)
                fb_crawlfile = crawl_facebook(access_token, run_args.count,
                                              run_args.limit, 10000, specific)
            else:
                fb_crawlfile = crawl_facebook(access_token, run_args.count,
                                              run_args.limit, 10000, [])
            pages = json.load(open(fb_crawlfile))
        else:
            pages = json.load(run_args.file)

        i = 0
        for page in pages:

            output = dict()
            i += 1

            print("\n# " + str(i) + ".1 Crawling page '" + page['name'] +
                  "'...")
            filename = "data/datasets/" + page['id']
            if not os.path.isfile(
                    filename +
                    ".json") or run_args.erase or os.path.getsize(filename +
                                                                  ".json") < 3:
                crawl.crawl_page(access_token,
                                 page['id'],
                                 200,
                                 run_args.limit,
                                 output_name=page['id'])
            else:
                print("Skipped (-e to not skip)")

            print("# " + str(i) + ".2 Filtering content of the page...")
            if not os.path.isfile(filename + "_filtered.json"
                                  ) or run_args.erase or os.path.getsize(
                                      filename + "_filtered.json") < 3:
                filter.filter_data(filename + ".json", False, 1, 1, 1)
            else:
                print("Skipped (-e to not skip)")

            print("# " + str(i) + ".3 Normalizing content of the page...")
            if not os.path.isfile(filename + "_filtered_normalized.json"
                                  ) or run_args.erase or os.path.getsize(
                                      filename +
                                      "_filtered_normalized.json") < 3:
                normalize.normalize_data(filename + "_filtered.json")
            else:
                print("Skipped (-e to not skip)")

            print("# " + str(i) + ".4 Counting main reactions of the page...")
            if not os.path.isfile(filename + "_filtered_normalized_count.json"
                                  ) or run_args.erase or os.path.getsize(
                                      filename +
                                      "_filtered_normalized_count.json") < 3:
                posts = None
                with open(filename + "_filtered_normalized.json",
                          'r') as infile:
                    posts = json.load(infile)
                    count = Counter()
                    for post in posts:
                        count.update([post['reaction']])

                # Add category & fan_count
                output['id'] = page['id']
                output['name'] = page['name']
                output['category'] = page['category']
                output['fan_count'] = page['fan_count']
                output['reaction'] = count

                with open(filename + "_filtered_normalized_count.json",
                          'w') as outfile:
                    json.dump(output, outfile)
            else:
                print("Skipped (-e to not skip)")

    cats = []
    reacts = []
    X = []
    Y = []
    for element in os.listdir('data/datasets'):
        if element.endswith('_count.json'):
            page = json.load(open('data/datasets/' + element))

            # Build dictionary with the main reactions
            if page['reaction']:
                max_key, max_value = max(page['reaction'].items(),
                                         key=lambda x: x[1])
                final_max = {max_key: max_value}
                for k, v in page['reaction'].items():
                    if v >= max_value * (1 - run_args.value):
                        final_max[k] = v

                # Debug: print the dominant reactions for a sample category
                if page['category'] == "Author":
                    print(final_max)

                reacts.append(final_max)
                cats.append(page['category'])

    if run_args.nojoy:
        i_to_delete = []
        for i in range(0, len(cats)):

            if convert_reaction_to_bitmask(
                    reacts[i]) == REACTIONS_BITMASK['joy']:
                i_to_delete.append(i)

        i_to_delete.sort(reverse=True)
        for i in i_to_delete:
            cats.pop(i)
            reacts.pop(i)

    reactions = build_reaction_vector(reacts)
    categories = build_category_vector(cats)
    print(categories)
    for i in range(0, len(cats)):

        # Add a small random jitter so overlapping points remain visible
        if convert_reaction_to_bitmask(
                reacts[i]) == REACTIONS_BITMASK['joy'] or run_args.nojoy:
            offset = random.uniform(-0.1, 0.1)
        else:
            offset = 0
        X.append(categories[cats[i]] + offset)

        if convert_reaction_to_bitmask(
                reacts[i]) == REACTIONS_BITMASK['joy'] or run_args.nojoy:
            offset = random.uniform(-0.1, 0.1)
        else:
            offset = 0
        Y.append(reactions[convert_reaction_to_bitmask(reacts[i])] + offset)

    plt.figure()
    plt.plot(X, Y, '+')
    plt.title('Reactions by category\nPages with min. 10000 fans')
    plt.figtext(0.9,
                0.9,
                str(len(categories)) + ' categories\n' + str(len(reacts)) +
                ' data points',
                fontsize=9,
                ha='right')
    bitmasks = []
    for bitmask in list(reactions.keys()):
        bitmasks.append(convert_bitmask_to_react(bitmask))
    plt.yticks(range(len(list(reactions.keys()))), bitmasks, rotation=0)
    plt.xticks(range(len(list(categories.keys()))),
               list(categories.keys()),
               rotation=80)
    plt.show()