Ejemplo n.º 1
0
def init_feature_list():
    """Build the default feature descriptors for every configured column.

    One ``Feature`` is created per name in ``BIN_COLS`` (typed
    ``FeatureType.BIN``), followed by one per name in ``VAL_COLS`` (typed
    ``FeatureType.VAL``).  Every descriptor uses the column name as both
    ``name`` and ``prefix``, with ``startid=1`` and ``drop=False``.

    Returns:
        list: the descriptors, BIN columns first, then VAL columns.
    """
    logging.info("init feature list")

    def describe(column, kind):
        # All columns share the same defaults; only the feature type varies.
        return Feature(name=column,
                       prefix=column,
                       startid=1,
                       type=kind,
                       drop=False)

    binary = [describe(column, FeatureType.BIN) for column in BIN_COLS]
    valued = [describe(column, FeatureType.VAL) for column in VAL_COLS]
    return binary + valued
Ejemplo n.º 2
0
 class D(BaseModel, self.Settings):
     # Declares a three-step feature chain: stream -> length -> total.
     # NOTE(review): the bases reference `self`, so this class is presumably
     # declared inside a method (e.g. a test) -- confirm against the caller.

     # Root feature built from TextStream, declared with store=True.
     stream = Feature(TextStream, store=True)
     # Built from CharacterCountNonGeneratorProcessMethod; depends on
     # `stream` through needs=stream.
     length = Feature(
             CharacterCountNonGeneratorProcessMethod,
             needs=stream,
             store=True)
     # Built from Total; depends on `length` through needs=length.
     total = Feature(Total, needs=length, store=True)
Ejemplo n.º 3
0
def dryer_data2(*feature_names):
	"""Count languages per area, genus and feature-value combination.

	Only languages listed by every feature in ``feature_names`` are counted.

	Returns a nested dict ``data[area][genus][values] = language_count``,
	where ``values`` is the comma-joined ``'description'`` of the language's
	sorted feature values.

	Raises IndexError when called without any feature name.
	"""
	genealogy = Genealogy()

	# Seed with the languages of the first feature, then narrow the set down
	# to the languages that every requested feature has.
	common_codes = set(lang.code for lang in Feature(feature_names[0]).languages())
	for name in feature_names:
		common_codes &= set(lang.code for lang in Feature(name).languages())

	counts = {}
	for code in common_codes:
		language = genealogy.find_language_by_code(code)
		area = language.area
		genus = language.genus.name
		key = ','.join(v['description'] for v in sorted(language.features.values()))

		per_value = counts.setdefault(area, {}).setdefault(genus, {})
		per_value[key] = per_value.get(key, 0) + 1

	return counts
Ejemplo n.º 4
0
 class Split(BaseModel, self.Settings):
     # Declares one input stream that fans out into two stored branches
     # (uppercase / lowercase) which are then combined by `cat`.
     # NOTE(review): `settings1` / `settings2` come from the enclosing scope;
     # presumably two distinct persistence configurations -- confirm.

     # Root feature built from TextStream; declared with store=False.
     stream = Feature(TextStream, store=False)
     # Built from ToUpper on `stream`; stored with persistence=settings1.
     uppercase = Feature(
             ToUpper, needs=stream, store=True, persistence=settings1)
     # Built from ToLower on `stream`; stored with persistence=settings2.
     lowercase = Feature(
             ToLower, needs=stream, store=True, persistence=settings2)
     # Built from Concatenate; depends on both branches, store=False.
     cat = Feature(
             Concatenate, needs=[uppercase, lowercase], store=False)
Ejemplo n.º 5
0
def all_features(parse_dict, constituent, i, constituents):
    """Extract and merge all features of one constituent and its connective.

    Nine features are produced, in this order: connective string, lowercased
    connective string, constituent context, connective-to-constituent path,
    that path suffixed with a left-sibling bucket, connective category, left
    sibling count, right sibling count and relative position.

    Args:
        parse_dict: parse data, forwarded to ``dict_util.get_CON_Str``.
        constituent: object exposing ``syntax_tree`` and ``connective``.
        i: not used by this function.
        constituents: not used by this function.

    Returns:
        The result of ``util.mergeFeatures`` over the nine features.
    """
    syntax_tree = constituent.syntax_tree
    conn_category = Connectives_dict().conn_category
    connective = constituent.connective

    # Lookup tables for the dictionary-backed features.
    table_CON_Str = NT_dict().dict_CON_Str
    table_CON_LStr = NT_dict().dict_CON_LStr
    table_NT_Ctx = NT_dict().dict_NT_Ctx
    table_CON_NT_Path = NT_dict().dict_CON_NT_Path
    table_CON_NT_Path_iLsib = NT_dict().dict_CON_NT_Path_iLsib

    # Locate the connective inside the sentence's syntax tree.
    conn_indices = connective.token_indices
    DocID = connective.DocID
    sent_index = connective.sent_index
    conn_node = dict_util.get_conn_node(syntax_tree, conn_indices)

    # Raw feature values.
    CON_Str = dict_util.get_CON_Str(parse_dict, DocID, sent_index,
                                    conn_indices)
    CON_LStr = CON_Str.lower()
    CON_Cat = conn_category[connective.name]
    CON_iLSib = dict_util.get_CON_iLSib(syntax_tree, conn_node)
    CON_iRSib = dict_util.get_CON_iRSib(syntax_tree, conn_node)
    NT_Ctx = dict_util.get_NT_Ctx(constituent)
    CON_NT_Path = dict_util.get_CON_NT_Path(conn_node, constituent)
    CON_NT_Position = dict_util.get_CON_NT_Position(conn_node, constituent)
    # Bucket the left-sibling count into "more than one" / "at most one".
    sibling_bucket = ":>1" if CON_iLSib > 1 else ":<=1"
    CON_NT_Path_iLsib = CON_NT_Path + sibling_bucket

    category_ids = {"subordinator": 1, "coordinator": 2, "adverbial": 3}
    position_ids = {"right": 1, "left": 2}

    features = [
        get_feature({}, table_CON_Str, CON_Str),
        get_feature({}, table_CON_LStr, CON_LStr),
        get_feature({}, table_NT_Ctx, NT_Ctx),
        get_feature({}, table_CON_NT_Path, CON_NT_Path),
        get_feature({}, table_CON_NT_Path_iLsib, CON_NT_Path_iLsib),
        # category
        get_feature({}, category_ids, CON_Cat),
        # numeric sibling counts
        Feature("", 1, {1: CON_iLSib}),
        Feature("", 1, {1: CON_iRSib}),
        # position
        get_feature({}, position_ids, CON_NT_Position),
    ]

    return util.mergeFeatures(features)
Ejemplo n.º 6
0
 class D2(BaseModel, self.Settings):
     # Declares a word-count pipeline plus a timestamp, both fed by `stream`.
     # NOTE(review): the bases reference `self`, so this class is presumably
     # declared inside a method (e.g. a test) -- confirm against the caller.

     # Root feature built from TextStream, declared with store=True.
     stream = Feature(TextStream, store=True)
     # Built from Tokenizer on `stream`; declared with store=False.
     words = Feature(Tokenizer, needs=stream, store=False)
     # JSONFeature built from WordCount on `words`; store=True.
     count = JSONFeature(WordCount, needs=words, store=True)
     # JSONFeature built from TimestampEmitter on `stream`, tagged with
     # version='2'.
     timestamp = JSONFeature(
             TimestampEmitter,
             version='2',
             needs=stream,
             store=True)
Ejemplo n.º 7
0
 class Contrived(BaseModel, self.Settings):
     # Declares two parallel stream -> tokenizer -> word-count chains whose
     # counts are both consumed by a single aggregate feature.

     # Two independent root features built from TextStream (store=False).
     stream1 = Feature(TextStream, store=False)
     stream2 = Feature(TextStream, store=False)
     # One Tokenizer feature per stream (store=False).
     t1 = Feature(Tokenizer, needs=stream1, store=False)
     t2 = Feature(Tokenizer, needs=stream2, store=False)
     # One stored WordCount JSONFeature per tokenizer.
     count1 = JSONFeature(WordCount, needs=t1, store=True)
     count2 = JSONFeature(WordCount, needs=t2, store=True)
     # Built from WordCountAggregator; depends on both counts, store=True.
     aggregate = JSONFeature( \
             WordCountAggregator, needs=[count1, count2], store=True)
Ejemplo n.º 8
0
 class D1(BaseModel, self.Settings):
     # Declares a word-count pipeline, a timestamp and a dependency-validation
     # feature, all fed (directly or indirectly) by `stream`.
     # NOTE(review): the bases reference `self`, so this class is presumably
     # declared inside a method (e.g. a test) -- confirm against the caller.

     # Root feature built from TextStream, declared with store=True.
     stream = Feature(TextStream, store=True)
     # Built from Tokenizer on `stream`; declared with store=False.
     words = Feature(Tokenizer, needs=stream, store=False)
     # JSONFeature built from WordCount on `words`; store=True.
     count = JSONFeature(WordCount, needs=words, store=True)
     # JSONFeature built from TimestampEmitter on `stream`, tagged with
     # version='1'.
     timestamp = JSONFeature(
             TimestampEmitter,
             version='1',
             needs=stream,
             store=True)
     # Built from ValidatesDependencies on `stream`; store=True.
     validated = Feature(ValidatesDependencies, needs=stream, store=True)
Ejemplo n.º 9
0
def main():
    """Fit a one-component PCA on a toy matrix and print the feature tree.

    One ``Feature`` per input column is attached to a root feature through
    ``root.transform('init', ...)``.  The fitted model and the column
    features are handed to ``doWithPCA`` before the tree is printed.
    """
    samples = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    n_columns = samples.shape[1]

    tree_root = Feature('root')
    column_features = np.array([])
    for column in range(n_columns):
        leaf = Feature('feature_%d' % column)
        tree_root.transform('init', leaf)
        column_features = np.append(column_features, leaf)

    pca = PCA(n_components=1)
    pca.fit(samples)
    doWithPCA(pca, column_features)
    tree_root.printTree()
Ejemplo n.º 10
0
def NT_curr_next_level_distance(parse_dict, constituent, i, constituents):
    """Feature: depth difference between constituent ``i`` and the next one.

    The depth of a node is ``int(tree.get_distance(root, node))``.  The
    feature value is ``next_depth - curr_depth``; for the last constituent
    (which has no successor) the fixed value 100 is used instead.

    ``parse_dict`` is accepted but not used.
    """
    has_successor = i != len(constituents) - 1
    if not has_successor:
        return Feature("", 1, {1: 100})

    curr_node = constituents[i].node
    next_node = constituents[i + 1].node

    tree = constituent.syntax_tree.tree
    root_node = tree.get_tree_root()

    def depth(node):
        # Distance from the tree root, truncated to an integer.
        return int(tree.get_distance(root_node, node))

    curr_depth = depth(curr_node)
    next_depth = depth(next_node)
    return Feature("", 1, {1: next_depth - curr_depth})
Ejemplo n.º 11
0
def main():
    """Fit a OneHotEncoder on a toy matrix and print the feature tree.

    One ``Feature`` per input column is attached to a root feature through
    ``root.transform('init', ...)``.  The fitted encoder and the column
    features are handed to ``doWithOneHotEncoder`` before the tree is
    printed.
    """
    samples = [[1, 2], [2, 3]]
    n_columns = len(samples[0])

    tree_root = Feature('root')
    column_features = np.array([])
    for column in range(n_columns):
        leaf = Feature('feature_%d' % column)
        tree_root.transform('init', leaf)
        column_features = np.append(column_features, leaf)

    encoder = OneHotEncoder(n_values=[5, 8], sparse=True)
    encoder.fit(samples)
    doWithOneHotEncoder(encoder, column_features)
    tree_root.printTree()
Ejemplo n.º 12
0
def NT_prev_curr_level_distance(parse_dict, constituent, i, constituents):
    """Feature: depth difference between constituent ``i`` and the previous one.

    The depth of a node is ``int(tree.get_distance(root, node))``.  The
    feature value is ``curr_depth - prev_depth``; for the first constituent
    (which has no predecessor) the fixed value 100 is used instead.

    ``parse_dict`` is accepted but not used.
    """
    has_predecessor = i != 0
    if not has_predecessor:
        return Feature("", 1, {1: 100})

    curr_node = constituents[i].node
    prev_node = constituents[i - 1].node

    tree = constituent.syntax_tree.tree
    root_node = tree.get_tree_root()

    def depth(node):
        # Distance from the tree root, truncated to an integer.
        return int(tree.get_distance(root_node, node))

    curr_depth = depth(curr_node)
    prev_depth = depth(prev_node)
    return Feature("", 1, {1: curr_depth - prev_depth})
Ejemplo n.º 13
0
def main():
    """Fit a VarianceThreshold selector on a toy matrix and print the tree.

    One ``Feature`` per input column is attached to a root feature through
    ``root.transform('init', ...)``.  The fitted selector and the column
    features are handed to ``doWithSelector`` before the tree is printed.
    """
    from sklearn.feature_selection import VarianceThreshold

    samples = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
    n_columns = len(samples[0])

    tree_root = Feature('root')
    column_features = np.array([])
    for column in range(n_columns):
        leaf = Feature('feature_%d' % column)
        tree_root.transform('init', leaf)
        column_features = np.append(column_features, leaf)

    selector = VarianceThreshold()
    selector.fit(samples)
    doWithSelector(selector, column_features)
    tree_root.printTree()
Ejemplo n.º 14
0
    def __init__(self):
        """Set the non-explicit data file paths and create a feature handle."""
        # Every path is built by appending a fixed suffix to the module-level
        # FILE_PATH: training data, test data, model and prediction output.
        self.train_file = FILE_PATH + '/../data/conll.nonexp.train'
        self.test_file = FILE_PATH + '/../data/conll.nonexp.test'
        self.model_file = FILE_PATH + '/../data/conll.nonexp.model'
        self.predicted_file = FILE_PATH + '/../data/conll.nonexp.test.predicted'

        # Feature is constructed without arguments; how the handle is used is
        # not visible from this method.
        self.feat_handle = Feature()
Ejemplo n.º 15
0
def prev_curr_some_clause(parse_dict, constituent, i, constituents):
    """Feature: is constituent ``i`` in the same clause as its predecessor?

    The clause list of the connective's sentence is looked up in (and, on a
    miss, added to) the module-level ``dict_clauses`` cache, keyed by
    ``(DocID, sent_index)``.  A constituent belongs to the first clause whose
    indices contain all of the constituent's indices.

    Returns:
        ``Feature("", 1, {1: flag})`` where ``flag`` is 1 when constituents
        ``i`` and ``i - 1`` were both matched to the same clause, and 0
        otherwise (always 0 for ``i == 0``).
    """
    connective = constituent.connective
    DocID = connective.DocID
    sent_index = connective.sent_index
    cache_key = (DocID, sent_index)

    if cache_key not in dict_clauses:
        dict_clauses[cache_key] = dict_util.get_sent_clauses(
            parse_dict, DocID, sent_index)
    clauses_list = dict_clauses[cache_key]  # e.g. [[1, 2], [4, 5, 6]]

    def clause_number(position):
        # Index of the first clause covering the constituent at `position`,
        # or -1 when no clause covers it.
        for number, clause in enumerate(clauses_list):
            if set(constituents[position].indices) <= set(clause):
                return number
        return -1

    # For each constituent, decide whether it sits in the same clause as the
    # constituent right before it.
    same_clause = 0
    if i > 0:
        curr_no = clause_number(i)
        prev_no = clause_number(i - 1)
        if curr_no != -1 and prev_no != -1 and curr_no == prev_no:
            same_clause = 1

    return Feature("", 1, {1: same_clause})
Ejemplo n.º 16
0
    def __init__(self, filename=""):
        """Read ``filename`` and parse each non-blank line into a ``Feature``.

        Nothing is read when ``filename`` is the empty string.

        Lines for which ``Feature`` raises ``FeatureComment`` are skipped
        silently; lines raising ``FeatureInputError`` are reported on stderr
        and skipped.

        Raises:
            GFF_IOError: if ``filename`` is not a regular file or opening it
                fails with an ``IOError``.
        """
        # Read file and generate list of features

        self.filename = filename
        if filename != "":
            try:
                if not os.path.isfile(filename):
                    raise GFF_IOError("Could not open file '" + filename +
                                      "': Not a file.")
                f = open(filename, 'r')
            except IOError as s:
                raise GFF_IOError("Could not open file '" + filename + "': " +
                                  str(s))
            # Release the handle as soon as the content has been read, even
            # if reading fails; it was previously left open.
            try:
                lines = f.readlines()
            finally:
                f.close()
            # Drop lines that are empty after stripping whitespace.
            lines = filter(string.strip, lines)

            # NOTE(review): `features` is not stored on `self` in the part of
            # the method visible here -- confirm it is used further down.
            features = []
            for l in lines:
                try:
                    f = Feature(l)
                except FeatureComment:
                    pass
                except FeatureInputError as s:
                    sys.stderr.write('Error in feature: ' + str(s) + "  " + l)
                else:
                    features.append(f)
Ejemplo n.º 17
0
    def map(self, f):
        """Lazily apply ``f`` to every feature of this collection.

        :param f: callable taking a single feature as its argument.
        :return: generator yielding ``Feature(f(feature))`` for each feature,
            in iteration order.
        """
        for item in self:
            transformed = f(item)
            yield Feature(transformed)
Ejemplo n.º 18
0
def verbs(relation, parse_dict):
    """Build the verb features of a relation.

    Two features are merged with ``util.mergeFeatures``:

    1. the number of (Arg1 word, Arg2 word) pairs, compared in lower case,
       where both words are in the verb-class dictionary and share at least
       one of their ``#``-separated classes;
    2. a feature built by ``get_feature_by_list`` from the main-verb POS
       values of Arg1 followed by those of Arg2.
    """
    verb_classes = Non_Explicit_dict().dict_verb_classes

    # 1. Count cross-argument word pairs that share a verb class.
    arg1_words = dict_util.get_Arg_Words_List(relation, "Arg1", parse_dict)
    arg2_words = dict_util.get_Arg_Words_List(relation, "Arg2", parse_dict)

    shared_class_pairs = 0
    for left in arg1_words:
        for right in arg2_words:
            left_key, right_key = left.lower(), right.lower()
            if left_key not in verb_classes or right_key not in verb_classes:
                continue
            left_classes = set(verb_classes[left_key].split("#"))
            right_classes = set(verb_classes[right_key].split("#"))
            if left_classes & right_classes:
                shared_class_pairs += 1
    pair_count_feature = Feature("", 1, {1: shared_class_pairs})

    # 2. POS of the main verb of each argument.
    arg1_pos = dict_util.get_main_verb_pos(relation, "Arg1", parse_dict)
    arg2_pos = dict_util.get_main_verb_pos(relation, "Arg2", parse_dict)
    pos_feature = get_feature_by_list(arg1_pos + arg2_pos)

    return util.mergeFeatures([pair_count_feature, pos_feature])
Ejemplo n.º 19
0
def HarrisCorner(images, isDisplay):
    """Detect Harris corners in every image and attach them as features.

    For each image, the file named by ``image.getImageName()`` is read,
    converted to float32 grayscale and run through ``cv.cornerHarris``.  The
    response is thresholded at 0.001 of its maximum, connected components
    are extracted and their centroids refined with ``cv.cornerSubPix``.  One
    ``Feature(x, y, image)`` is created per refined corner and the list is
    stored through ``image.setFeatures``.

    When ``isDisplay`` is truthy, each corner is drawn as a filled circle of
    colour ``(0, 0, 255)`` and the image is shown until a key is pressed.
    """
    subpix_criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 100, 0.001)

    for image in images:
        logging.info(f'IMAGE {image.getImageID():02d}:Applying Harris Corners Detection')
        canvas = cv.imread(image.getImageName())
        gray = np.float32(cv.cvtColor(canvas, cv.COLOR_BGR2GRAY))

        response = cv.cornerHarris(gray, 2, 3, 0.0001)
        _, binary = cv.threshold(response, 0.001 * response.max(), 255, 0)
        _, _, _, centroids = cv.connectedComponentsWithStats(np.uint8(binary))
        corners = cv.cornerSubPix(
            gray, np.float32(centroids), (5, 5), (-1, -1), subpix_criteria)

        detected = []
        for corner in corners:
            x, y = corner[0], corner[1]
            if isDisplay:
                cv.circle(canvas, (int(x), int(y)), 4, (0, 0, 255), -1)
            detected.append(Feature(x, y, image))

        if isDisplay:
            cv.imshow(f'Image {image.getImageID()}', canvas)
            cv.waitKey(0)
            cv.destroyAllWindows()
        image.setFeatures(detected)
Ejemplo n.º 20
0
def readGeonamesFile(filename, feature_codes):
    """Parse a tab-separated geonames file into ``Feature`` objects.

    The first line is the header: each stripped column name is mapped to its
    position.  Every following line is split on tabs and passed, together
    with that mapping, to ``Feature``.

    Args:
        filename: path of the geonames file to read.
        feature_codes: object whose ``get(feature.featureClassCode)`` supplies
            the value stored as ``feature.featureName``.

    Returns:
        dict: ``feature.geonameid -> feature``; empty when the file has no
        data rows.
    """
    # contains the features from the file
    features = {}

    # contains the field names from the header row
    fields = {}

    logging.debug("processing geonames file")
    # `with` closes the handle even when a row fails to parse, and iterating
    # the file object streams lines instead of loading the whole file.
    with open(filename, "r") as geonames_file:
        for linecounter, line in enumerate(geonames_file, 1):
            row = line.split("\t")
            if linecounter == 1:
                # field name -> column position; strip() removes the newline
                # carried by the last header field.
                logging.debug("processing geonames file header row")
                fields = {name.strip(): position
                          for position, name in enumerate(row)}
            else:
                # NOTE(review): unlike the header, the last field of a data
                # row keeps its trailing newline -- confirm Feature copes.
                feature = Feature(fields, row)
                # look up the name matching the feature code and class
                feature.featureName = feature_codes.get(feature.featureClassCode)
                features[feature.geonameid] = feature
    return features
Ejemplo n.º 21
0
 def __init__(self, K, middle_state=None):
     """Store the configuration and create empty patient bookkeeping.

     ``K`` and ``middle_state`` are kept as given; the patient containers
     start out empty and a fresh ``Feature()`` is created.
     """
     self.K = K
     self.middle_state = middle_state
     self.patient_cluster = {}  # pat_id -> cluster index
     self.patient_info = {}  # pat_id -> Patient
     self.patient_id = []  # pat_id values
     self.feature = Feature()  # store
Ejemplo n.º 22
0
    def transform_single(self, f):
        """Merge ``f`` with a summarised version of itself.

        A new ``Feature`` is built from ``f.name``, ``f.data`` and
        ``feature_summary(f.data)``; the result of
        ``Feature.merge_instances(f, new_feature)`` is returned.
        """
        summary = feature_summary(f.data)
        summarised = Feature(f.name, f.data, summary)
        return Feature.merge_instances(f, summarised)
Ejemplo n.º 23
0
def load_feat_dataset(data_dir):
    """Build one ``Feature`` per pose that has a matching label image.

    Label images are the ``.png`` files listed by ``get_filelist`` under
    ``<data_dir>/labels``; poses come from ``<data_dir>/poses.txt``.  A pose
    matches the image whose file name, minus its last four characters,
    equals ``pose.name``.  Each feature is filled through ``Feature.parse``
    with the pose, the three histograms returned by ``load_label_img`` and
    the planar distance travelled since the previous pose (0 for the first).

    Returns:
        list: the parsed features, in pose order.
    """
    label_paths = get_filelist(os.path.join(data_dir, "labels"), ".png")
    poses = load_pose(os.path.join(data_dir, "poses.txt"))

    # File name without its 4-character extension -> full path.
    path_by_name = {os.path.split(path)[-1][:-4]: path for path in label_paths}

    dataset = []
    matched, stride = 0, 1
    for idx, pose in enumerate(tqdm(poses)):
        x, y = pose.data[0], pose.data[1]
        if idx == 0:
            # The first pose has no predecessor: measure it against itself.
            x_prev, y_prev = x, y
        step_distance = np.sqrt((x - x_prev)**2 + (y - y_prev)**2)
        x_prev, y_prev = x, y

        if pose.name not in path_by_name:
            continue
        # Keep every `stride`-th matched pose.
        if matched % stride == 0:
            hist_l, hist_c, hist_r = load_label_img(path_by_name[pose.name])
            sample = Feature()
            sample.parse(pose, hist_l, hist_c, hist_r, step_distance)
            dataset.append(sample)
        matched += 1

    return dataset
Ejemplo n.º 24
0
def eval_simple_agents():
    """Run the Stop, Shortest and Random baseline agents on each split.

    For every split a fresh environment and evaluator are created; each
    agent writes its results to ``<RESULT_DIR><split>_<agent>_agent.json``
    and the score summary of that file is pretty-printed.
    """
    split_names = ("train", "val_seen", "val_unseen")
    baseline_names = ("Stop", "Shortest", "Random")

    for split in split_names:
        features = Feature(None, False)
        env = R2RBatch(features, False, False, 6, False, "lstm",
                       batch_size=1, splits=[split], tokenizer=None)
        evaluator = Evaluation([split], encoder_type="lstm")

        for agent_type in baseline_names:
            outfile = "%s%s_%s_agent.json" % (RESULT_DIR, split,
                                              agent_type.lower())
            agent_cls = BaseAgent.get_agent(agent_type)
            agent = agent_cls(env, outfile)
            agent.test()
            agent.write_results()

            score_summary, _ = evaluator.score(outfile)
            print("\n%s" % agent_type)
            pp.pprint(score_summary)
Ejemplo n.º 25
0
def clause_word_num(arg_clauses, clause_index, parse_dict):
    """Feature holding the size of one clause.

    The value is ``len(arg_clauses.clauses[clause_index][0])`` -- presumably
    the clause's word list.  ``parse_dict`` is accepted but not used.
    """
    clause = arg_clauses.clauses[clause_index]
    word_count = len(clause[0])

    # NOTE(review): the key is the string "1" here, whereas sibling feature
    # functions use the integer 1 -- confirm which form Feature expects.
    return Feature("", 1, {"1": word_count})
Ejemplo n.º 26
0
def main(args):
    """Run the full pipeline: load resources, extract features, evaluate.

    Args:
        args: object whose ``yaml_path`` attribute names the configuration
            file.
    """
    # Load configuration
    config = Configuration(args.yaml_path)

    print("Loading Probase...")
    knowledge_base = Probase(config)

    print("Loading dataset...")
    dataset = Data(config)

    print("Loading NLP utility...")
    nlp_tools = NLP('en')

    print("Loading feature extractor...")
    extractor = Feature(config, knowledge_base, nlp=nlp_tools)

    print("Extracting vector features")
    extractor.extract_vector_features(dataset)

    print("Extracting statistical vector features")
    extractor.extract_statistical_features(dataset)

    print("Evaluating clasifiers")
    evaluator = Evaluation(config, dataset)
    evaluator.full_evaluation(extractor.X, extractor.y)
Ejemplo n.º 27
0
def gen_feature_pool_from_array(fm, d):
    """Yield one ``Feature`` per entry of ``fm``, paired with a column of ``d``.

    The k-th entry of ``fm`` contributes its ``name`` and ``st`` and is
    combined with column ``k`` of ``d`` (``d[:, k]``).

    Raises:
        AssertionError: once ``fm`` is exhausted, if its number of entries
            differs from ``d.shape[1]``.
    """
    consumed = 0
    for consumed, template in enumerate(fm, 1):
        yield Feature(template.name, d[:, consumed - 1], template.st)
    assert consumed == d.shape[1], \
        "Result feature pool and given array didn't match: {} != {}".format(consumed, d.shape[1])
Ejemplo n.º 28
0
    def guess_three(self, season, round_num):
        """Print the ranking comparison of the selected fixtures.

        Reads ``input/yingchao_result.csv`` (first line skipped) and keeps
        the rows whose first column equals ``season``, whose third column
        equals ``round_num`` and whose second column equals 4.  For each such
        row the cleaned home and guest team names, their rankings and the
        ranking difference (home minus guest) are printed, tab-separated.
        """
        rankings = Feature(4)
        rankings.prepare_team_ranking()

        with codecs.open('input/yingchao_result.csv', 'rb', 'utf-8') as results:
            next(results)  # skip the first line (presumably a header)
            for line in results:
                columns = line.strip('\r\n').split(',')

                wanted = (season == int(columns[0])
                          and round_num == int(columns[2])
                          and int(columns[1]) == 4)
                if not wanted:
                    continue

                home_team = Feature.team_name_clear(columns[3])
                guest_team = Feature.team_name_clear(columns[4])

                home_key = Feature.create_team_id('2017', columns[1],
                                                  home_team)
                guest_key = Feature.create_team_id('2017', columns[1],
                                                   guest_team)

                home_ranking = rankings.get_team_ranking(home_key)
                guest_ranking = rankings.get_team_ranking(guest_key)
                diff_ranking = home_ranking - guest_ranking

                print('\t'.join([home_team,
                                 guest_team,
                                 str(home_ranking),
                                 str(guest_ranking),
                                 str(diff_ranking)]))
Ejemplo n.º 29
0
    def __init__(self):
        """Learn the features and train an SVC classifier on them.

        The feature extractor is configured from the module-level settings
        (``color_space``, ``spatial_size``, ...).  The scaled feature matrix
        is split 80/20 into training and test sets with a random seed drawn
        from ``np.random.randint(0, 100)``; the classifier is trained on the
        training part and its accuracy on the test part is printed.
        """
        extraction_settings = dict(color_space=color_space,
                                   spatial_size=spatial_size,
                                   hist_bins=hist_bins,
                                   orient=orient,
                                   pix_per_cell=pix_per_cell,
                                   cell_per_block=cell_per_block,
                                   hog_channel=hog_channel,
                                   spatial_feat=spatial_feat,
                                   hist_feat=hist_feat,
                                   hog_feat=hog_feat,
                                   show_debug=False)
        self.feature = Feature(None, None, **extraction_settings)
        self.feature.learn()

        # Scaled feature matrix, its scaler and the matching label vector.
        scaled_X, self.X_scaler = self.feature.get_scaled_X()
        labels = self.feature.get_label_vector()

        # Split up data into randomized training and test sets.
        seed = np.random.randint(0, 100)
        train_X, test_X, train_y, test_y = train_test_split(
            scaled_X, labels, test_size=0.2, random_state=seed)

        print('Using:', orient, 'orientations', pix_per_cell,
              'pixels per cell and', cell_per_block, 'cells per block')
        print('Feature vector length:', len(train_X[0]))

        # Use a linear SVC
        self.c = Classifier('SVC')
        self.c.run(train_X, train_y)
        print('SVC Accuracy: ', self.c.getAccuracy(test_X, test_y))
Ejemplo n.º 30
0
 def get_feature(self, name):
     """Load a feature's state from Redis.

     Returns:
         A ``Feature`` built from ``name``, the value of
         ``self.__get_feature_percentage(name)`` and the members of the
         Redis set ``feature:<name>:users``; ``None`` when any exception is
         raised while fetching the data (the error is logged).
     """
     try:
         redis = Redis(connection_pool=self.redis_pool)
         return Feature(
             name,
             self.__get_feature_percentage(name),
             redis.smembers(f"feature:{name}:users"),
         )
     except Exception:
         # logger.exception already records the active exception and its
         # traceback.  The exception used to be passed as a %-argument to a
         # message without placeholders, which made the log record itself
         # fail to format.
         logger.exception(
             "[PyCurtain] Redis error while getting data from feature [%s]",
             name)
         return None