Code Example #1
File: validation.py  Project: edelmoral/c45-python
    def train(self, domain, class_data):
        document = xml.dom.minidom.Document()
        node = document.createElement('Tree')
        document.appendChild(node)
        d = Trainer(domain, class_data, document)
        partial_atts = d.attributes
        partial_atts.remove("Id")
        partial_atts.remove("Vote")
        print partial_atts

        if len(self.restr) > 0:
            d.rem_restrictions(self.restr)

        d.c45(d.data, d.attributes, node, 0)

        self.classifier = Classifier()

        if len(class_data.category) > 0:
            self.classifier.has_category = True

        for row in d.data:
            self.classifier.classify(document.documentElement, row,
                                     class_data.attributes)

        self.classifier.print_stats()
Code Example #2
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(
        tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
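
Most of the examples on this page assume a Classifier wrapper that takes node vectors (or embeddings) together with a scikit-learn estimator and exposes split_train_evaluate(X, Y, ratio), returning micro/macro F1 scores. The snippet below is only a rough, illustrative sketch of such a wrapper for readers who want a runnable reference; the internals shown here (one-vs-rest wrapping, MultiLabelBinarizer, a random split) are assumptions, not the code of any of the projects listed on this page.

# Illustrative sketch only: assumed internals, not any project's real Classifier.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


class Classifier:
    def __init__(self, vectors, clf):
        self.vectors = vectors                  # dict: node id -> embedding vector
        self.clf = OneVsRestClassifier(clf)     # handles multi-label nodes
        self.binarizer = MultiLabelBinarizer()

    def split_train_evaluate(self, X, Y, train_ratio, seed=0):
        """X: node ids, Y: list of label lists; returns {'micro': ..., 'macro': ...}."""
        rng = np.random.default_rng(seed)
        order = rng.permutation(len(X))
        split = int(len(X) * train_ratio)
        train_idx, test_idx = order[:split], order[split:]

        features = np.asarray([self.vectors[x] for x in X])
        labels = self.binarizer.fit_transform(Y)

        self.clf.fit(features[train_idx], labels[train_idx])
        predictions = self.clf.predict(features[test_idx])
        return {avg: f1_score(labels[test_idx], predictions, average=avg)
                for avg in ('micro', 'macro')}


# Hypothetical usage with random data, just to show the call pattern:
if __name__ == '__main__':
    vecs = {i: np.random.rand(16) for i in range(200)}
    node_labels = [[i % 4] for i in range(200)]
    clf = Classifier(vectors=vecs, clf=LogisticRegression(max_iter=1000))
    print(clf.split_train_evaluate(list(vecs), node_labels, 0.8))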
Code Example #3
File: main.py  Project: willyptrain/MinCut-baseline
def classify(vectors, args):
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')

    #     print("Training classifier using {:.2f}% nodes...".format(args.train_percent * 100))
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    #     scores = clf.split_train_evaluate(X, Y, args.train_percent)
    features, labels, graph, idx_train, idx_val, idx_test = load_dataset(
        str(args.classifydir.split("/")[-1]))
    #     print(idx_train)
    #     print(type(idx_train))
    idx_train = list(idx_train)

    #     idx_val = list(idx_val)
    #     idx_val += list(idx_test)[:600]

    idx_test = list(idx_test)  #[600:]

    #     for i in idx_val:
    #         idx_train.append(i)

    #     idx_val = idx_val[400:]

    print("TRAINING SIZE", len(idx_train), "VALIDATION SIZE", len(idx_val),
          "TESTING SIZE: ", len(list(idx_test)))
    scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_val)

    # scores = clf.split_train_evaluate(X, Y, args.train_percent)
    test_scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_test)
    test_x.append(test_scores['macro'])
    print("micro:", test_scores['micro'], "macro:", test_scores['macro'])

    return scores
Code Example #4
def main(args):
    t1 = time.time()
    g = Graph()
    print("Reading...")

    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)

    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio *
                                                              100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
Code Example #5
File: run_digits.py  Project: fedefliguer/AAI
def test_1NN(digitsdat, selected, all_test_m):

    for testm in all_test_m:

        classifier = Classifier()
        classifier.build_model(digitsdat.X[selected[0:testm], :], digitsdat.y[ selected[0:testm]])
        print("m=%d error=%f" % ( testm, classifier.classify(digitsdat.testX, digitsdat.testy)))
Code Example #6
File: icbc.py  Project: zhxgigi/toolkids
def classify_singlechar():
    singlechardir =  r'D:\projects\python\captchabreak\data\icbc\icbc1213_output'
    outputdir = r'D:\projects\python\captchabreak\data\icbc\icbc_c_output'
    try:
        os.mkdir(outputdir)
    except:
        pass
    
    traindata = r'D:\projects\python\captchabreak\data\icbc\icbcaudit'
    cc = Classifier(traindata, class_sample_num=20, size = 20)
    cc.train()
    count =0
    for subdir in cc.classify_map:
        try:
            os.mkdir(os.path.join(outputdir, subdir))
        except:
            pass
    for root, dirs, files in os.walk(singlechardir):
        for file in files:
            path = os.path.join(root, file)
            img = cv.LoadImage(path, 0)
            response = cc.classify_single_char(img)
            
            outputfile = os.path.join(outputdir, response, 'icbc_%d.jpg'%count)
            cv.SaveImage(outputfile, img)
            count += 1
Code Example #7
def node_classification(session, bs, seqne, sequences, seq_len, node_n, samp_idx, label, ratio):
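    # Encode the sequences in fixed-size batches, accumulate per-node hidden-state
    # sums and counts, average them into one embedding per node, and evaluate a
    # logistic-regression classifier at the given train/test ratio.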
    enc_sum_dict = {}
    node_cnt = {}
    s_idx, e_idx = 0, bs
    while e_idx < len(sequences):
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx: e_idx],
                                           seqne.dropout: 0, seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(enc_sum_dict, node_cnt, sequences,
                                                       batch_enc.astype('float32'), seq_len, s_idx)

        s_idx, e_idx = e_idx, e_idx + bs

    if s_idx < len(sequences):
        batch_enc = session.run(seqne.encoder_output,
                                feed_dict={seqne.input_seqs: sequences[s_idx: len(sequences)],
                                            seqne.dropout: 0,
                                           seqne.keep_prob: 0})
        enc_sum_dict, node_cnt = reduce_seq2seq_hidden_add(enc_sum_dict, node_cnt, sequences,
                                                           batch_enc.astype('float32'), seq_len, s_idx)

    node_enc_mean = reduce_seq2seq_hidden_avg(sum_dict=enc_sum_dict, count_dict=node_cnt, node_num=node_n)
    lr = Classifier(vectors=node_enc_mean, clf=LogisticRegression())
    f1_micro, f1_macro = lr.split_train_evaluate(samp_idx, label, ratio)
    return f1_micro
Code Example #8
 def initialize(self, conf, ctx):
     self.counter = 0
     self.pid = os.getpid()
     self.total = 0
     self.classifier = Classifier()
     self.directory = str(os.getcwd()) + "/Tweet_Images"
     if not os.path.exists(self.directory):
         os.makedirs(self.directory)
Code Example #9
File: repeat_classify.py  Project: sctennis77/semeval
	def __init__(self,**kargs):
		Classifier.__init__(self,tweets=kargs["tweets"],instances=kargs["instances"],model=kargs["model"],keys=kargs["keys"],selection=kargs["selection"])
		self.mode = "ngram"
		self.include_word = True
		self.inclued_pos = True
		self.id="repeat{0},s:{1}".format(self.num_items,self.selection)

		self.repeat_dict = self.get_all_repeats()
		self.prepare_features()
Code Example #10
File: main.py  Project: chithangduong/commnet
def classify(vectors, args):
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')
    print("Training classifier using {:.2f}% nodes...".format(
        args.train_percent * 100))
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    scores = clf.split_train_evaluate(X, Y, args.train_percent)
    return scores
Code Example #11
class TagCountBolt(Bolt):
    outputs = ['cls', 'tag', 'date', 'hour']

    def initialize(self, conf, ctx):
        self.counter = 0
        self.pid = os.getpid()
        self.total = 0
        self.classifier = Classifier()
        self.directory = str(os.getcwd()) + "/Tweet_Images"
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
            #self.logger.info("------CREATED FOLDER--------")

    def _increment(self, word, inc_by):
        self.counter[word] += inc_by
        self.total += inc_by

    def process(self, tup):
        data = json.loads(tup.values[0].encode('utf-8'))
        self.logger.info(data)
        if 'img_url' in data:
            path = "{}/{}.jpg".format(self.directory, self.counter)
            try:
                urllib.urlretrieve(data['img_url'], path)
                self.counter = self.counter + 1
                self.classifier.load_image(path)
                predicted_class = self.classifier.classify()
                #self.logger.info("\n [INFO_BOLT_PREDICTION] : "+ " ".join(predicted_class))
                if len(data['hash']) > 0:
                    tags = [
                        str(li['text']) for li in data['hash']
                        if li['text'][0:1] != "\\"
                    ]
                    #self.logger.info("\n [INFO_BOLT_TAGS] : "+ " ".join(tags))

                    now = datetime.datetime.now()
                    now_date = "{:04}-{:02}-{:02}".format(
                        now.year, now.month, now.day)
                    for cls in predicted_class:
                        if len(tags) > 0:
                            for tag in tags:
                                self.emit([cls, tag, now_date, str(now.hour)])
                                #self.logger.info("{0}/{1}".format(cls,tag))

                os.remove(path)

            except (KeyboardInterrupt, Exception):
                self.logger.info(Exception)

        else:
            self.logger.info("NO IMG URL!!!")
            #self.logger.info(json.dumps(data))

        if self.counter % 10 == 0:
            self.logger.info("Processed [{:,}] tweets".format(self.counter))
Code Example #12
	def __init__(self, **kargs):
		Classifier.__init__(self,tagged_tweets=kargs["tagged_tweets"],instances=kargs["instances"],model=kargs["model"],keys=kargs["keys"])
		self.mode = kargs["mode"]
		self.inclued_pos = kargs["pos"]
		self.include_word = kargs["word"]
		self.mode= kargs["mode"]
		self.id="context{0},m:{1},w:{2},t:{3}".format(self.num_items,self.mode,self.include_word,self.inclued_pos)
		self.debug=False
		self.ranked_ngrams = kargs["ranked_ngrams"]
		self.num_ngrams = len(self.ranked_ngrams)
		self.context_dict = self.build_ct_dict() 
		self.prepare_features()
Code Example #13
def test_1NN(digitsdat, selected, all_test_m):
    for testm in all_test_m:
        classifier = Classifier()
        # model = build(digitsdat.X[selected[0:testm], :], digitsdat.y[selected[0:testm]])
        classifier.build_model(digitsdat.X[selected[0:testm], :],
                               digitsdat.y[selected[0:testm]])
        error = classifier.classify(digitsdat.testX, digitsdat.testy)
        # accuracy = res(model)
        # print("m=%d error=%f" % (testm, 100-accuracy))
        print("m=%d error=%f" % (testm, error))
        # global M, e
        M.append(testm)
        e.append(error)
Code Example #14
def main(args):
    node_embeddings = load_embeddings(args.embedding_file)
    if args.label_file:
        labels = read_node_label(args.label_file)

    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)

    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)

    if args.clustering:
        print("Clustering")
        clustering(node_embeddings, labels, args.exp_times)

    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)

    if args.classification:
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        clf_ratio_list = args.clf_ratio.strip().split(',')
        result_list = {}
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:  # clf_ratio_list:
            result_per_test = []
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings, clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test

        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d:   ' % (index + 1), result)

            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
Code Example #15
File: ngram_classify.py  Project: sctennis77/semeval
	def __init__(self, **kargs):
		Classifier.__init__(self,tweets=kargs["tweets"],instances=kargs["instances"],model=kargs["model"],keys=kargs["keys"],selection=kargs["selection"])
		self.mode = kargs["mode"]
		self.inclued_pos = kargs["pos"]
		self.include_word = kargs["word"]
		self.mode= kargs["mode"]
		self.rank = kargs['rank'] #int(self.num_ngrams *self.keep_features)
		self.id="{1}{0},w:{2},t:{3},s:{4},r:{5}".format(self.num_items,self.mode,self.include_word,self.inclued_pos,self.selection,self.rank)
		self.debug=False
		self.ngram_dict = self.get_ranked_ngrams()
		self.ranked_ngrams = sorted(self.ngram_dict,key = lambda x: self.ngram_dict[x],reverse=True)
		self.ngram_dict = {}
		self.num_ngrams = len(self.ranked_ngrams)
		self.prepare_features()
Code Example #16
File: online_task.py  Project: zhuxiang/MB_TREC2015
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()
Code Example #17
def node_classification(embeddings, label_path, name, size):

    X, Y = read_node_label(embeddings, label_path)

    f_c = open('results/%s_classification_%d.txt' % (name, size), 'w')

    all_ratio = []

    for tr_frac in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:

        print(" Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
        clf = Classifier(embeddings=embeddings, clf=LogisticRegression(), name=name)
        results = clf.split_train_evaluate(X, Y, tr_frac)

        avg = 'macro'
        f_c.write(name + ' train percentage: ' + str(tr_frac) + ' F1-' + avg +
                  ' ' + str('%0.5f' % results[avg]))
        all_ratio.append(results[avg])
        f_c.write('\n')
Code Example #18
def prepare_data(filename):
    """
    Parse and classify text from file with given filename to prepare data to
    generate the report.
    """
    parser = Parser()
    classifier = Classifier()

    class_duration = {    # Dictionary for duration of each class
        'W': timedelta(hours=0, minutes=0),
        'R': timedelta(hours=0, minutes=0),
        'N': timedelta(hours=0, minutes=0),
    }
    action_durations = {} # Dictionary for duration of each action

    activities = parser.parse_file(filename)
    for activity in activities:
        duration = activity['duration']
        actions = activity['actions']

        divided_duration = duration / len(actions)
        for action in actions:
            classification = classifier.classify_action(action)
            class_duration[classification] += divided_duration
            if action in action_durations:
                action_durations[action] += divided_duration
            else:
                action_durations[action] = divided_duration

    sorted_action_durations = sorted(action_durations.items(),
                                     key=lambda tup: tup[1], reverse=True)

    # Add structure to data and return
    plot_data = {
        'summary_pie_chart': {
            'class_duration': class_duration,
        },
    }
    report_data = {
        'sorted_action_durations': sorted_action_durations,
        'class_duration': class_duration,
        'timedelta_to_string': timedelta_to_string,
    }
    return plot_data, report_data
Code Example #19
def evaluate_embeddings_with_split(vectors,
                                   X_train,
                                   Y_train,
                                   X_test,
                                   Y_test,
                                   Y_all,
                                   testnum=10):
    print("Training classifier with the pre-defined split setting...")
    #clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf = Classifier(vectors=vectors, clf=SVC(probability=True))
    micro_list = []
    macro_list = []
    for i in range(testnum):
        res = clf.evaluate_with_fixed_split(X_train, Y_train, X_test, Y_test,
                                            Y_all)

        micro_list.append(res['micro'])
        macro_list.append(res['macro'])
    return mean(micro_list), mean(macro_list)
Code Example #20
File: line.py  Project: yyr93520/sne
    def __init__(self,
                 graph,
                 lr=.001,
                 rep_size=128,
                 batch_size=1000,
                 epoch=10,
                 negative_ratio=5,
                 order=3,
                 label_file=None,
                 clf_ratio=0.5,
                 auto_stop=True):
        self.rep_size = rep_size
        self.order = order
        self.best_result = 0
        self.vectors = {}
        self.model = _LINE(graph,
                           lr,
                           rep_size,
                           batch_size,
                           negative_ratio,
                           order=self.order)
        for i in range(epoch):
            self.model.train_one_epoch()
            if label_file:
                self.get_embeddings()
                X, Y = read_node_label(label_file)
                print "Training classifier using {:.2f}% nodes...".format(
                    clf_ratio * 100)
                clf = Classifier(vectors=self.vectors,
                                 clf=LogisticRegression())
                result = clf.split_train_evaluate(X, Y, clf_ratio)

                if result['micro'] < self.best_result and auto_stop:
                    self.vectors = self.last_vectors
                    print 'Auto stop!'
                    return
                elif result['micro'] > self.best_result:
                    self.best_result = result['micro']
        self.get_embeddings()
Code Example #21
File: douban.py  Project: zhxgigi/toolkids
def main():
    traindata = r'D:\projects\python\captchabreak\data\doubanaudit'
    douban = Classifier(traindata, 5)
    douban.train()
    
    testdata = r'D:\projects\python\captchabreak\data\doubanseg'
    outputdir = r'D:\projects\python\captchabreak\data\doubanoutput'
    count =0
    for subdir in douban.classify_map:
        try:
            os.mkdir(os.path.join(outputdir, subdir))
        except:
            pass
    for root, dirs, files in os.walk(testdata):
        for file in files:
            path = os.path.join(root, file)
            img = cv.LoadImage(path, 0)
            response = douban.classify_single_char(img)
            
            outputfile = os.path.join(outputdir, response, 'douban_%d.jpg'%count)
            cv.SaveImage(outputfile, img)
            count += 1
Code Example #22
def main(args):
    print("xnetmf", "begin...")
    t1 = time.time()
    print("Reading...")
    nx_graph = nx.read_edgelist(args.input, nodetype=int, comments="%")
    adj_matrix = nx.adjacency_matrix(nx_graph).todense()
    print(adj_matrix)
    g = Graph(adj_matrix)
    rep_method = RepMethod(
        max_layer=2
    )  # Learn representations with xNetMF.  Can adjust parameters (e.g. as in REGAL)
    representations = src.xnetmf.get_representations(g, rep_method)
    print(representations)
    print(representations.shape)
    print("TAWD", "begin...")
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)

    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = xtadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(args.clf_ratio *
                                                              100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
Code Example #23
File: online_task.py  Project: zhuxiang/MB_TREC2015
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]    # related tweets for the current day, stored for offline analysis
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()
Code Example #24
def main(argv):
    # Constants for the analyzer and the classifier
    dataset = 'commit_comments-dump.2015-01-29.json'
    group = 'id'
    model_file = 'model.pickle'

    # Create the analyzer
    analyzer = Analyzer(group)

    # Create the classifier
    algorithm_class = RandomForestRegressor
    algorithm_parameters = {
        'n_estimators': 100,
        'n_jobs': 2,
        'min_samples_split': 10
    }
    classifier = Classifier(group, model_file)
    classifier.create_model(train=True,
                            class_name=algorithm_class,
                            parameters=algorithm_parameters)

    # Compare analyzer output with classifier output and identify differences
    unrecognized_negative = {}
    unrecognized_positive = {}
    predictions = classifier.predict()
    line = 0  # Dataset line
    i = 0  # Prediction ID (+1)
    file = open(dataset, 'rb')
    for data in Utilities.read_json(file, 'id', group):
        line = line + 1
        if line % 1000 == 0:
            print(line)
        if not classifier.filter(data):
            continue
        i = i + 1

        message = data['message']
        score = analyzer.analyze(message)[0]
        if score == 0:
            continue

        diff = predictions[i - 1] - score
        if abs(diff) < 1.0:
            continue

        target = unrecognized_negative if diff < 0 else unrecognized_positive
        target[line] = diff

    result = sorted(unrecognized_positive.items(), key=lambda x: x[1])
    for item in result:
        print("{}: {}: {}".format(item[0], item[1],
                                  linecache.getline(dataset, item[0])[:-1]))
Code Example #25
File: validation.py  Project: edelmoral/c45-python
    def train(self, domain, class_data):
        document = xml.dom.minidom.Document()
        node = document.createElement('Tree')
        document.appendChild(node)
        d = Trainer(domain, class_data, document)
        partial_atts = d.attributes
        partial_atts.remove("Id")
        partial_atts.remove("Vote")
        print partial_atts
      
        if len(self.restr) > 0:
            d.rem_restrictions(self.restr)

        d.c45(d.data, d.attributes, node, 0)

        self.classifier = Classifier()

        if len(class_data.category) > 0:
            self.classifier.has_category = True

        for row in d.data:
            self.classifier.classify(document.documentElement, row, class_data.attributes)
            
        self.classifier.print_stats()
Code Example #26
    def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file = None, clf_ratio = 0.5, auto_save = True):
        self.rep_size = rep_size
        self.order = order
        self.best_result = 0
        self.vectors = {}
        if order == 3:
            self.model1 = _LINE(graph, rep_size/2, batch_size, negative_ratio, order=1)
            self.model2 = _LINE(graph, rep_size/2, batch_size, negative_ratio, order=2)
            for i in range(epoch):
                self.model1.train_one_epoch()
                self.model2.train_one_epoch()
                if label_file:
                    self.get_embeddings()
                    X, Y = read_node_label(label_file)
                    print "Training classifier using {:.2f}% nodes...".format(clf_ratio*100)
                    clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)

                    if result['macro'] > self.best_result:
                        self.best_result = result['macro']
                        if auto_save:
                            self.best_vector = self.vectors

        else:
            self.model = _LINE(graph, rep_size, batch_size, negative_ratio, order=self.order)
            for i in range(epoch):
                self.model.train_one_epoch()
                if label_file:
                    self.get_embeddings()
                    X, Y = read_node_label(label_file)
                    print "Training classifier using {:.2f}% nodes...".format(clf_ratio*100)
                    clf = Classifier(vectors=self.vectors, clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)

                    if result['macro'] > self.best_result:
                        self.best_result = result['macro']
                        if auto_save:
                            self.best_vector = self.vectors

        self.get_embeddings()
        if auto_save and label_file:
            self.vectors = self.best_vector
Code Example #27
def main(argv):
    folds = int(argv[0]) if len(argv) > 0 else 5
    filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results
    try:
        with open('experiment_results.json', 'r') as file:
            results = json.load(file)
    except:
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if 'disabled' in algorithm and algorithm['disabled']:
            continue
        if filter and all(
            [filter not in algorithm[k].lower() for k in filter_fields]):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'],
                                         algorithm['class_name'])
        dense = algorithm['dense'] if 'dense' in algorithm else False

        # Create all possible combinations of parameters.
        parameter_combinations = itertools.product(
            *algorithm['parameters'].values())

        single_parameters = [
            param for param, values in algorithm['parameters'].iteritems()
            if len(values) == 1
        ]
        string_parameters = [
            param for param, values in algorithm['parameters'].iteritems()
            if isinstance(values[0], (str, unicode))
        ]
        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(algorithm['parameters'].keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False,
                                    class_name=class_name,
                                    parameters=parameters,
                                    dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()

            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as file:
                json.dump(results, file, indent=4, separators=(',', ': '))
Code Example #28
import io
import numpy as np

from torch import nn
import torch.utils.model_zoo as model_zoo
import torch.onnx
import onnxruntime
import cv2

from classify import Classifier, get_default_args

if __name__ == '__main__':
    cfg = get_default_args()

    classify = Classifier(cfg)
    torch_model = classify.model
    torch_model.to(torch.device('cpu'))

    x = torch.randn(1, 3, 224, 224)


    frame = cv2.imread('/home/cmf/datasets/spoof/NUAADetectedface/Detectedface/ImposterFace/0001/0001_00_00_01_0.jpg')
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    img = cv2.resize(img, (224, 224))
    # img = np.float32(img)
    img = img.astype(np.float32)
    # print(img.max(), img.min())
    img = img / 255
    # print(img.max(), img.min())
    img = np.transpose(img, (2, 0, 1))
Code Example #29
File: demo.py  Project: WangWenjun559/Weiss
"""
This is a demo about how to use LibLINEAR to do the prediction
"""
from classify import Classifier

# Create an object first, use this object all the time
test = Classifier()
# When a query comes, Example query
query = "Can we talk about Terminator"
# Do the prediction and get information
plausible = set([5,7,8])
p_label = test.action_info(query,plausible)
print p_label
Code Example #30
File: postag_classify.py  Project: sctennis77/semeval
	def __init__(self,**kargs):
		Classifier.__init__(self,tweets=kargs["tweets"],instances=kargs["instances"],model=kargs["model"],keys=kargs["keys"],selection=kargs["selection"])
		self.id="tagcount{0},s:{1}".format(self.num_items,self.selection)
Code Example #31
File: online_task.py  Project: zhuxiang/MB_TREC2015
class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        self.schedule.enter(0, 0, self.dump_schedule, ())
        self.schedule.run()
        self.process()

    def process(self):
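        # Read the live tweet stream from stdin: skip near-duplicates (SimHash) and
        # already-pushed tweets, classify each tweet to a topic profile, weight the
        # classifier similarity by the external ranker's score, push the tweets that
        # qualify, and roll results over when a new day starts.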
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
            simhash_value = Simhash(tweet_text).value
            if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                continue

            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':
                continue

            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:
                continue

            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                self.pushed_tweets_ids.add(tid_retweet)

            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)

            if struct_time.tm_mday != start_day:
                self.dump_result(start_day)
                start_day = struct_time.tm_mday

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/'+file_name, 'w') as fw:
            for index, records in enumerate(self.related_tweets):
                pid = str(index+226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank+1, record[2], 'CSSNA'))
        with open('submit/task-a/'+file_name, 'w') as fw:
            with open('submit/task-a_extr/'+file_name, 'w') as fw_extr:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index+226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_extr.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))

        self.related_tweets = [[] for _ in range(225)]
        self.pushed_tweets = [[] for _ in range(225)]
Code Example #32
if len(sys.argv) > 3 or len(sys.argv) < 2:
    print("Please Enter valid parameter:")
    print("Parameter: Congress term number")
    print("Option: --skip, avoid data cleaning")
    sys.exit()

if len(sys.argv) == 2:
    congress_id = str(sys.argv[1])
    if os.path.isfile("rawData/" + "speeches_" + congress_id +
                      ".txt") and os.path.isfile("rawData/" + congress_id +
                                                 "_SpeakerMap.txt"):
        print("cleaning ....")
        data_cleaner = Cleaner([congress_id])
        data_cleaner.clean_pipeline()
        print("classifying ....")
        congress_classifier = Classifier([congress_id])
        congress_classifier.base_pipeline()
        print("done.")
        sys.exit()
    else:
        print(
            "There are no speeches and speakerMap text file to process for congress "
            + congress_id)
        print(
            "Please put the target congress raw text data into rawData directory"
        )
        sys.exit()

if len(sys.argv) == 3 and (sys.argv[1] == "--skip"
                           ):  # skip data cleaning (data is already cleaned)
    congress_id = str(sys.argv[2])
Code Example #33
File: lastic.py  Project: kisate/llastic
import pickle
import threading

import cv2
from mailbox import Client, MessageHandler
from colorfind import colorfind, loadScalars, connComps
from sys import argv
from coords import avg, mul, toInt, split, diff, from_cv2_to_mm_centered
from classify import Classifier

cap = cv2.VideoCapture(1)
cap2 = cv2.VideoCapture(2)
host = '192.168.43.83'
port = 6164  # fix number
msg = [0]
n = 0
stop_event = threading.Event()
# cap.set(cv2.CAP_PROP_POS_MSEC, 15000)
nn = Classifier()


def nothing(x):
    return x


def sign(x):
    if x > 0:
        return 1.
    elif x < 0:
        return -1.
    elif x == 0:
        return 0.
    else:
        return x
Code Example #34
File: compare_friends.py  Project: mbecker1999/ezxkcd
        break
    elif file_mode == 'w+':
        friends_file = open(complete_name, 'w+')
        print("enter the names of your FB contacts [Ctrl-D to save it]: ")
        while True:
            try:
                line = input()
            except EOFError:
                break
            friends_file.write('%r\n' %line)
        break
    else:
        print('incorrect input, please try again\n')

friends_file.seek(0)
c = Classifier()

from_lengths = {}
to_lengths = {}
from_sentiments = {}
to_sentiments = {}

friends = []

for l in friends_file:
    a,b = l.split(" ")
    friend_name = a + ' ' + b
    friends.append(friend_name)

for friend_name in friends:
    print(friend_name)
Code Example #35
File: validation.py  Project: edelmoral/c45-python
class Validator:
    def __init__(self, restrictions):
        self.attributes = [];
        self.true_pos = 0
        self.true_neg = 0
        self.false_pos = 0
        self.false_neg = 0
        
        if len(restrictions) > 0:
            self.restr = restrictions.restr
        else: 
            self.restr = restrictions

    def train(self, domain, class_data):
        document = xml.dom.minidom.Document()
        node = document.createElement('Tree')
        document.appendChild(node)
        d = Trainer(domain, class_data, document)
        partial_atts = d.attributes
        partial_atts.remove("Id")
        partial_atts.remove("Vote")
        print partial_atts
      
        if len(self.restr) > 0:
            d.rem_restrictions(self.restr)

        d.c45(d.data, d.attributes, node, 0)

        self.classifier = Classifier()

        if len(class_data.category) > 0:
            self.classifier.has_category = True

        for row in d.data:
            self.classifier.classify(document.documentElement, row, class_data.attributes)
            
        self.classifier.print_stats()

    #def print_stats(self):


    def recall(self):
        TP = self.classifier.true_pos
        FN = self.classifier.false_neg

        return float(TP) / (TP + FN)

    def precision(self):
        TP = self.classifier.true_pos
        FP = self.classifier.false_pos

        return float(TP) / (TP + FP)

    def pf(self):
        TN = self.classifier.true_neg
        FP = self.classifier.false_pos

        return float(FP) / (FP + TN)

    def fmeasure(self):
        beta = 2 
        return float(beta * self.precision() * self.recall()) / (self.precision() + self.recall())

    def confusion_matrix(self):
        print "###### CONFUSION MATRIX #######"
        print "                | Classified Positive | Classified Negative |"
        print "Actual Positive |          " + self.classifier.true_pos + "           |          " + self.classifier.false_neg + "           |"
        print "Actual Negative |          " + self.classifier.false_pos + "           |           " + self.classifier.true_neg + "          |"
Code Example #36
File: main.py  Project: esraabil/NECL
def main(args):
    print("number-walks " + str(args.number_walks))
    print("representation-size " + str(args.representation_size))
    print("walk-length " + str(args.walk_length))
    print("inout_fle " + str(args.input))
    print("******")
    g = Graph()
    deepw = False
    # similarity thresholds for comparison
    trsl = [0.45, 0.495, 0.5, 0.55, 0.6, 0.7, 0.8, 1]
    #    trsl=[ 0.5 ]
    learn = True
    X, Y = read_node_label(args.label_file)
    seed = 0
    clsratio = [0.01, 0.05, 0.07, 0.1, 0.25, 0.5, 0.7, 0.8]
    #    clsratio=[ 0.1,0.2,0.4, 0.6,0.7,0.8,0.9]#,0.7,0.8]# use for blogcatalog

    np.random.seed(seed)
    shuffle_indices = np.random.permutation(np.arange(len(X)))
    f = open(args.input + "shu.txt", "w")
    f.writelines(str(item) + "\n" for item in shuffle_indices)
    f.close()
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input, directed=args.directed)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)
    G = g.G
    print("before spar, n: " + str(len(G.nodes())) + " m: " +
          str(len(G.edges())))
    #compute similarity score for compression
    t1 = time.time()
    p = pC(G, 0.45)
    scoreNode = p.ScoreCompute()
    t3 = time.time()
    f = open(args.input + "score.txt", "w")
    f.writelines(
        str(n[0]) + " " + str(n[1]) + " " + str(scoreNode[n]) + "\n"
        for n in scoreNode)
    f.close()
    print("total scorecom time: " + str(t3 - t1))
    #   read similarity scores from file
    #    f=open(args.input+"score.txt","r")
    #    scoreNode=dict()
    #    for x in f:
    #        l=x.split()
    #        scoreNode[((l[0]),(l[1]))] = float(l[2])

    for kk in range(0, len(trsl)):
        if learn:  # do embeding
            ths = trsl[kk]  #args.trs
            print("threshold is ...", ths)
            if args.graph_format == 'adjlist':
                g.read_adjlist(filename=args.input, directed=args.directed)
            elif args.graph_format == 'edgelist':
                g.read_edgelist(filename=args.input,
                                weighted=args.weighted,
                                directed=args.directed)
            if ths != 1:  #compression
                t1 = time.time()
                G = g.G
                G, nl2 = makeCompression(G, scoreNode, ths)
                f = open(args.input + "af_spar.txt", "w")
                f.writelines(str(n) + " " + str(nl2[n]) + "\n" for n in nl2)
                f.close()
                writeg(G, args)
                t2 = time.time()
                print("total_sparc_time: " + str(t2 - t1))
            #embedding
            t1 = time.time()
            print("After_compresing,n,m " + str(len(g.G.nodes())) + " " +
                  str(len(g.G.edges())))
            model = node2vec.Node2vec(graph=g,
                                      path_length=args.walk_length,
                                      num_paths=args.number_walks,
                                      dim=args.representation_size,
                                      workers=args.workers,
                                      p=args.p,
                                      q=args.q,
                                      window=args.window_size,
                                      dw=deepw)
            t2 = time.time()
            print("total_embeding_time " + str(t2 - t1))
            vectors = model.vectors
            if ths != 1:  #add embedding of removed nodes in compression
                addBack(nl2, vectors)
            np.save(args.output + "_" + str(ths) + ".npy", vectors)
        else:  #load embeddings
            vectors = np.load(args.output + "_" + str(ths) + ".npy")
            vectors = vectors.item(0)
            print("file_loaded")

    #print("Training classifier")
    #split_train_evaluate2 for single label (cora and wiki)
    #split_train_evaluate for multi lable (dblp and blogcatalog)
        for r in clsratio:
            clfa = Classifier(vectors,
                              clf=LogisticRegression(solver='liblinear'))
            res = clfa.split_train_evaluate2(
                X, Y, r, shuffle_indices)  # args.clf_ratio)
            print(str(r) + " " + str(res["macro"]) + " " + str(res["micro"]))
Code Example #37
def main(argv):
    folds = int(argv[0]) if len(argv) > 0 else 5
    filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results
    try:
        with open('experiment_results.json', 'r') as file:
            results = json.load(file)
    except:
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does 
        # not appear in any of the fields.
        if 'disabled' in algorithm and algorithm['disabled']:
            continue
        if filter and all([filter not in algorithm[k].lower() for k in filter_fields]):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'], algorithm['class_name'])
        dense = algorithm['dense'] if 'dense' in algorithm else False

        # Create all possible combinations of parameters.
        parameter_combinations = itertools.product(*algorithm['parameters'].values())

        single_parameters = [param for param,values in algorithm['parameters'].iteritems() if len(values) == 1]
        string_parameters = [param for param,values in algorithm['parameters'].iteritems() if isinstance(values[0],(str,unicode))]
        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(algorithm['parameters'].keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False, class_name=class_name, parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param,parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()
            
            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as file:
                json.dump(results, file, indent=4, separators=(',', ': '))
Code Example #38
from bottle import (route, run, template, request, redirect)
from parse import get_news, extract_next_page
from db import News, session
from scripts import save
from classify import Classifier

s = session()
classifier = Classifier()
mark_news = s.query(News).filter(News.label != None).all()
x_title = [row.title for row in mark_news]
y_lable = [row.label for row in mark_news]
classifier.fit(x_title, y_lable)
Code Example #39
    subj = ""
    for line in text.splitlines():
        if line.startswith("subject:"):
            is_subj = True
            subj = line[8:]
        else:
            lines.append(line)
    return email_extract(subj, " ".join(lines), min_len, max_len)


def test_enron_files(classifier, path, label, selector=None):
    files = get_file_list(path, selector)
    correct = total = 0
    for filename in files:
        with open(os.path.join(path, filename)) as fh:
            contents = fh.read()
        features = enron_email_extract(contents)
        res = classifier.classify(features)
        best = res[0][0]
        if best == label:
            correct += 1
        total += 1
    pct = 100 * (float(correct) / total)
    print 'Accuracy of "%s": %s%% based on %s documents' % (label, pct, total)


if __name__ == "__main__":
    classifier = Classifier.load("classifier.bin")
    for d, l in get_dir_and_labels():
        test_enron_files(classifier, d, l)
Code Example #40
File: enron.py  Project: kaushikmit/mlscripts
def load_classifier():
    if not os.path.exists('classifier.db'):
        sys.stderr.write('Unable to load classifier.db -- please run this script first')
        sys.exit(1)
    return Classifier.load('classifier.db')
Code Example #41
File: links.py  Project: aimeeble/links
        "link_type": surl.get_link_type_text(),
        "mime_type": surl.get_mime_type(),
        "short_url": surl.get_short_url(),
        "long_url": surl.get_long_url(),
        "short_code": surl.get_short_code(),
        "referrers": {},  # {ref->count}
        "locations": {},  # {IP->count}
        "hits": [],
        "social": surl.get_social(),
        "thumb": thumb,
    }

    # collect the stats
    itr = sdb.list_hits(surl.get_short_code())
    for hit in itr:
        classifier = Classifier(sdb, surl, hit)
        # Referrers
        ref = hit["referrer"]
        if ref not in params["referrers"]:
            params["referrers"][ref] = 0
        params["referrers"][ref] += 1

        hit["bot"] = classifier.is_bot()

        hit["time"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                    time.localtime(hit["time"]))
        hit["reltime"] = "2h"

        hit["cc"] = classifier.get_country_code('??')
        hit["area"] = classifier.get_region('??')
Code Example #42
File: bayes.py  Project: noeatnosleep/MrRogersbot
def load_classifier():
    if not os.path.exists('classifier.db'):
        classifier = train_all(None , corpus)
        classifier.save('classifier.db')
    return Classifier.load('classifier.db')
Code Example #43
File: Main.py  Project: setavenger/HomeBot

#
# main
#

rec.start_recording()

# initialise the tts speech engine
engine = pyttsx3.init()

# initialise Math class
calculator = Computation()

# initialise Classifier to find the right command Path
classifier = Classifier()


print('Please Speak now')
while True:

    samples = rec.get_samples()

    audio, finalize = vad.process_audio(samples)

    if not audio:
        continue

    logging.debug('decoding audio len=%d finalize=%s audio=%s' % (len(audio), repr(finalize), audio[0].__class__))

    user_utt, confidence = asr.decode(audio, finalize, stream_id=STREAM_ID)
Code Example #44
File: online_task.py  Project: zhuxiang/MB_TREC2015
class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]    # related tweets for the current day, stored for offline analysis
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # self.schedule.enter(0, 0, self.dump_schedule, ())
        # self.schedule.run()
        self.process()

    def process(self):
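        # Replay the archived tweet files in order: skip near-duplicates (SimHash) and
        # already-pushed tweets, classify each tweet to a topic profile, combine the
        # classifier similarity with the external ranker's score, record pushed and
        # related tweets per topic, and dump results after each input file.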
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            for line in open(filename, 'rb'):
                start = time.clock()
                tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                simhash_value = Simhash(tweet_text).value
                if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                    continue

                topic_id, similarity = self.classifier.classify(tweet_text)
                if topic_id == '':
                    continue

                count += 1
                if count % 10000 == 0:  logging.info('%d' % count)

                tweet_json['similarity'] = similarity
                evaluate_score = self.ranker.predict(json.dumps(tweet_json))
                total_score = (evaluate_score ** 0.5) * similarity
                # if total_score < 0.15:
                #     continue

                timestruct = time.gmtime(int(timestamp[:-3]))
                is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                if is_pushed:
                    delivery_time = float(timestamp) / 1000.0 + (time.clock() - start)
                    self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])

                utc_time = time.strftime('%Y%m%d', timestruct)
                self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])

                self.related_tweets_hash.add(simhash_value)
                self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()
        self.logger_info.info('\n=======finished!=======\n')

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/b_submit', 'a') as fw:
            with open('submit/task-b/b_review/B_candidateday_' + file_name[-2:], 'w') as fw_review:
                for index, records in enumerate(self.related_tweets):
                    pid = str(index+226)
                    sorted_records = sorted(records, key=lambda item: -item[2])
                    for rank, record in enumerate(sorted_records):
                        if rank >= 100:
                            break
                        fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank+1, record[2], 'CSSNA'))
                        fw_review.write('%s\tMB%s\tQ0\t%s\t%f\tSNACS\t%s\t%s\t%s\n' % (record[0], pid, record[1], record[2], record[3], record[4], record[5]))

        with open('submit/task-a/a_submit', 'a') as fw:
            with open('submit/task-a/a_review', 'a') as fw_review:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index+226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_review.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))

        self.related_tweets = [[] for _ in range(225)]    # clear the previous day's related-tweet records
        self.pushed_tweets = [[] for _ in range(225)]


    def dump_schedule(self):
        self.logger_info.info('saving result...')
        utc_time = time.strftime('%Y%m%d', time.gmtime())
        for index, records in enumerate(self.related_tweets):
            pid = str(index+226)
            with open('profile_MB' + pid, 'w') as fw:
                for record in records:
                    fw.write(utc_time + '\t' + pid + '\tQ0\t' + record + '\n')
        self.related_tweets = [[] for _ in range(226)]    # clear the previous day's related-tweet records
        self.schedule.enter(24*60*60, 0, self.dump_schedule, ())

    def detect_tweet_stream(self, year, month, d, h, m, s, ms):
        start = datetime.datetime(year, month, d, h, m, s, ms)
        delta = (start - datetime.datetime.now()).seconds
        self.logger_info.info('waiting secondes: ' + str(delta))
        time.sleep(delta)
        self.logger_info.info('tweet stream is ready')
        is_ready = True
        return is_ready
Code Example #45
#
# results_file = open('0127_best_result_wiki.txt', 'w')
# results_file.write(" %s\n" %  best_result)
#
# np.savetxt('best_result_wiki.out', best_result, delimiter='\t')  # X is an array

clf_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
node_size = len(vectors)
train_x = np.array([vectors[x] for x in X])

reshaped_train_x = train_x.reshape((train_x.shape[0], args.kstep, node_size))
train_x = low_encoder.predict(reshaped_train_x)

for clf_one in clf_list:
    print "Training classifier using {:.2f}% nodes...".format(clf_one * 100)
    clf = Classifier(vectors=train_x, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, clf_one)

# y_lable = np.array(Y)
# print(train_x.shape)
# print(type(train_x))
# print(type(y_lable))
#
# np.savetxt('train_x.out', train_x, delimiter='\t')   # X is an array
# np.savetxt('train_Y.out', y_lable.astype(int), delimiter='\t')   # X is an array
#
#
# print(results)
# results_file = open('citeseer_result_0.3.txt', 'w')
# for item in results:
#     results_file.write("%s\n" % item)
Code Example #46
File: weib_classify.py  Project: sctennis77/semeval
	def __init__(self,**kargs):
		Classifier.__init__(self,tweets=kargs["tweets"],instances=kargs["instances"],model=kargs["model"],keys=kargs["keys"],selection=kargs["selection"])
		self.polarity_dict = kargs["polarity_dict"]
		self.tag_map = kargs["tag_map"]
		self.id="weib{0},s:{1}".format(self.num_items,self.selection)
		self.prepare_features()
Code Example #47
from classify import Classifier

cl = Classifier()
print(cl.classify('./test/'))
Code Example #48
File: validation.py  Project: edelmoral/c45-python
class Validator:
    def __init__(self, restrictions):
        self.attributes = []
        self.true_pos = 0
        self.true_neg = 0
        self.false_pos = 0
        self.false_neg = 0

        if len(restrictions) > 0:
            self.restr = restrictions.restr
        else:
            self.restr = restrictions

    def train(self, domain, class_data):
        document = xml.dom.minidom.Document()
        node = document.createElement('Tree')
        document.appendChild(node)
        d = Trainer(domain, class_data, document)
        partial_atts = d.attributes
        partial_atts.remove("Id")
        partial_atts.remove("Vote")
        print partial_atts

        if len(self.restr) > 0:
            d.rem_restrictions(self.restr)

        d.c45(d.data, d.attributes, node, 0)

        self.classifier = Classifier()

        if len(class_data.category) > 0:
            self.classifier.has_category = True

        for row in d.data:
            self.classifier.classify(document.documentElement, row,
                                     class_data.attributes)

        self.classifier.print_stats()

    #def print_stats(self):

    def recall(self):
        TP = self.classifier.true_pos
        FN = self.classifier.false_neg

        return float(TP) / (TP + FN)

    def precision(self):
        TP = self.classifier.true_pos
        FP = self.classifier.false_pos

        return float(TP) / (TP + FP)

    def pf(self):
        TN = self.classifier.true_neg
        FP = self.classifier.false_pos

        return float(FP) / (FP + TN)

    def fmeasure(self):
        beta = 2
        return float(beta * self.precision() *
                     self.recall()) / (self.precision() + self.recall())

    def confusion_matrix(self):
        print "###### CONFUSION MATRIX #######"
        print "                | Classified Positive | Classified Negative |"
        print "Actual Positive |          " + self.classifier.true_pos + "           |          " + self.classifier.false_neg + "           |"
        print "Actual Negative |          " + self.classifier.false_pos + "           |           " + self.classifier.true_neg + "          |"