def test_avg(iterations, test_file, beam_size):
    data = prepare_data.read_file(test_file)
    feature = Feature()
    decoder = Decoder(beam_size, feature.get_score)
    count = 0
    data_size = len(data)
    model_file = open(
        '/home/xzt/CWS/model_result/avg-model_beam-size-' + str(beam_size) + '.pkl', 'rb')
    feature.load_model(model_file)
    model_file.close()
    # the output path is constant, so build it once outside the loop
    seg_data_file = ('/home/xzt/CWS/test_seg_data/avg-test-seg-data'
                     + '_beam-size-' + str(beam_size) + '.txt')
    for line in data:
        z = decoder.beamSearch(line)
        seg_data = ' '.join(z)
        with open(seg_data_file, 'a') as f:  # the with block closes the file itself
            f.write(seg_data + '\n')
        count += 1
        if count % 1000 == 0:
            print("segment with avg-model, finished %.2f%%" % ((count / data_size) * 100))
    print("segment with avg model finished")

def create_all_relations_and_features(self):
    """Create all possible relations between all entities.

    Features are also created here, for performance.
    """
    all_relations = []
    feature = None
    for source in self.events + self.timex:
        for target in self.events + self.timex:
            for i, time in enumerate(RelationType()):
                new_relation = Relation("all", self, source, target, time)
                # We don't have the feature for this pair yet
                if i == 0:
                    f = Feature(new_relation)
                    feature = f.get_feature()
                if new_relation in self.relations:
                    continue
                else:
                    new_relation.set_feature(feature)
                    all_relations.append(new_relation)
            feature = None
    self.relations = self.relations + all_relations

def build_features(self, image_shape):
    height, width = image_shape
    features = []
    # TODO: play with minimum feature size
    for w in range(1, width + 1):
        for h in range(1, height + 1):
            x = 0
            while x + w < width:
                y = 0
                while y + h < height:
                    # 2 horizontally aligned blocks
                    root = Region(x, y, w, h)
                    right = Region(x + w, y, w, h)
                    # check if the VJ feature fits into the image
                    if x + 2 * w < width:
                        features.append(Feature([right], [root]))
                    # 2 vertically aligned blocks
                    bottom = Region(x, y + h, w, h)
                    if y + 2 * h < height:
                        features.append(Feature([root], [bottom]))
                    # 3 horizontally aligned blocks
                    right2 = Region(x + 2 * w, y, w, h)
                    if x + 3 * w < width:
                        features.append(Feature([right], [right2, root]))
                    # 4 blocks in a 2x2 checkerboard
                    cross_bottom = Region(x + w, y + h, w, h)
                    if x + 2 * w < width and y + 2 * h < height:
                        features.append(Feature([right, bottom], [root, cross_bottom]))
                    y += 1
                x += 1
    return features

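# Hedged, self-contained companion to build_features above (not from the
# original repo): it only counts how many two-, three-, and four-block
# features those loops would emit for a window, so it runs without the
# Region/Feature classes and uses the same strict bounds checks.
def count_viola_jones_features(height, width):
    counts = {"2h": 0, "2v": 0, "3h": 0, "4": 0}
    for w in range(1, width + 1):
        for h in range(1, height + 1):
            for x in range(0, width - w):      # same positions as the while loops
                for y in range(0, height - h):
                    if x + 2 * w < width:
                        counts["2h"] += 1
                    if y + 2 * h < height:
                        counts["2v"] += 1
                    if x + 3 * w < width:
                        counts["3h"] += 1
                    if x + 2 * w < width and y + 2 * h < height:
                        counts["4"] += 1
    return counts

# count_viola_jones_features(8, 8) already totals just over a thousand
# features for a tiny 8x8 window; the 2x2 checkerboard type is the rarest.
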
def flw_dataset_classify():
    f = Feature()
    paths, classes = loadFaceData('face.csv', nrows=82)
    X = []
    y = []
    for index, path in enumerate(paths):
        ar = f.getFeature(path)
        print(index, path)
        if ar.all() == 0:
            continue
        X.append(ar)
        y.append(classes[index])
    X = np.array(X)
    y = np.array(y)
    print(X.shape)
    print(X)
    print(y)
    X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
        X, y, test_size=0.3, stratify=y)
    nearestCentroid = NearestCentroid()
    nearestCentroid.fit(X_train_data, y_train_data)
    predict_y = nearestCentroid.predict(X_test_data)
    acc = accuracy_score(y_test_data, predict_y)
    print(acc)

def scut_fbp_test():
    f = Feature()
    # af1and5 0.890287769784
    paths, classes = loadFaceData(
        './dataset/af1and5.csv', nrows=100)  # './dataset/all(round_score).csv' for full class
    X = []
    y = []
    for index, path in enumerate(paths):
        ar = f.getFeature(path)
        print(index, path)
        if ar.all() == 0:
            continue
        X.append(ar)
        y.append(round(classes[index]))
    X = np.array(X)
    y = np.array(y)
    print(X.shape)
    print(X)
    print(y)
    X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
        X, y, test_size=0.3, stratify=y)
    nearestCentroid = NearestCentroid()
    nearestCentroid.fit(X_train_data, y_train_data)
    predict_y = nearestCentroid.predict(X_test_data)
    acc = accuracy_score(y_test_data, predict_y)
    print(acc)

def tf_idf_training(comment_cnt_lower_bound, train_ratio):
    # train_set, cv_set = Train.simple_partition(comment_cnt_lower_bound,
    #                                            train_ratio)
    # idf_dict = Feature.cal_idf(train_set, Config.train_idf_path)
    train_set = Train.get_train_set()
    Feature.cal_tf_idf(train_set, Config.train_idf_path,
                       Config.train_tf_idf_path, True, 200)
    Genome.cal_tf_idf(Config.train_tf_idf_path,
                      Config.train_genome_tf_idf_path, 200)

def decorate_outside(obj, options=Map()):
    obj.points = []
    obj.points_edges = []
    obj.material_clear = Blocks.AIR
    border = flatten_list_of_lists(
        [vg.get_line_from_points(l[0], l[1]) for l in options.lines])
    if options.options.outside == "flowers":
        flowers_1 = []
        flowers_2 = []
        for i, b in enumerate(border):
            # TODO: Refactor to have multiple numbers of flowers
            if (i % 2) == 0:
                flowers_1.append(b)
            else:
                flowers_2.append(b)
        colors = Blocks.kind("Flower")
        np.random.shuffle(colors)
        obj.features.append(
            Feature("flowers", flowers_1, Map(material=colors[0])))
        obj.features.append(
            Feature("flowers", flowers_2, Map(material=colors[1])))
    elif options.options.outside == "trees":
        trees = []
        for i, b in enumerate(border):
            if (i % 3) == 0:
                trees.append(b)
        colors = Blocks.kind("Sapling")
        np.random.shuffle(colors)
        obj.features.append(Feature("flowers", trees, Map(material=colors[0])))
    elif options.options.outside == "grass":
        trees = []
        for i, b in enumerate(border):
            if (i % 3) == 0:
                trees.append(b)
        obj.features.append(
            Feature("flowers", trees, Map(material=Blocks.DOUBLETALLGRASS)))
    elif options.options.outside == "fence":
        fence_type = np.random.random_integers(188, 192)
        obj.features.append(Feature("fence", border, Map(material=fence_type)))
    return obj

def feature_detection(S_ana_log):
    # Find all non-NaN indices in S_ana_log
    indices = np.argwhere(~np.isnan(S_ana_log))
    #print(np.shape(indices))
    Features_list = []  # initialize Features_list
    for [x_ind, y_ind] in indices:  # for each pair of indices (pixel)
        if len(Features_list) == 0:
            newFeature = Feature(x_ind, y_ind)  # create a new feature
            Features_list.append(newFeature)  # add the new feature to Features_list
        else:
            # list of logicals marking which Features border the current pixel
            border_list = []
            # list of neighboring Features that border the current pixel
            sublist = []
            for currentFeature in Features_list:
                # find all Features that border the pixel
                border_list.append(currentFeature.borders(x_ind, y_ind))
            indslist = np.where(border_list)[0]
            if len(indslist) == 1:  # the pixel borders exactly one Feature
                hunterFeature = Features_list[indslist[0]]  # find that Feature
                hunterFeature.add(x_ind, y_ind)  # add the pixel to that Feature
            elif len(indslist) > 1:  # the pixel borders more than one Feature
                for ind in indslist:
                    sublist.append(Features_list[ind])  # collect those Features
                for s in sublist:
                    Features_list.remove(s)  # remove them from Features_list
                hunterFeature = conjoin(sublist)  # conjoin all Features in sublist
                hunterFeature.add(x_ind, y_ind)  # add the pixel to the conjoined Feature
                Features_list.append(hunterFeature)  # add it back to Features_list
            else:  # the pixel borders no existing Feature
                newFeature = Feature(x_ind, y_ind)  # create a new feature
                Features_list.append(newFeature)
    return (Features_list, indices)

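# Hedged standalone sketch of the same region-growing idea as
# feature_detection above, with plain sets of (x, y) tuples instead of the
# Feature class, and assuming borders() means 8-connected adjacency (an
# assumption; the real Feature.borders may use a different neighbourhood).
import numpy as np

def group_non_nan_pixels(arr):
    groups = []  # list of sets of (x, y) pixel coordinates
    for x, y in np.argwhere(~np.isnan(arr)):
        # find every existing group that touches this pixel
        touching = [g for g in groups
                    if any(abs(x - gx) <= 1 and abs(y - gy) <= 1 for gx, gy in g)]
        for g in touching:
            groups.remove(g)
        merged = set().union(*touching) if touching else set()
        merged.add((int(x), int(y)))
        groups.append(merged)
    return groups
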
def saveWordNetFeatures(self, fileOut):
    feature = Feature()
    synSynToScore = {}
    xuidPairs = self.getAllXUIDPairs()
    print("calculating wordnet features for", len(xuidPairs), "unique pairs")
    i = 0
    completed = set()
    for xuid1, xuid2 in xuidPairs:
        uid1 = self.corpus.XUIDToMention[xuid1].UID
        uid2 = self.corpus.XUIDToMention[xuid2].UID
        if (uid1, uid2) in completed or (uid2, uid1) in completed:
            continue
        completed.add((uid1, uid2))
        textTokens1 = self.corpus.XUIDToMention[xuid1].text
        textTokens2 = self.corpus.XUIDToMention[xuid2].text
        bestScore = -1
        for t1 in textTokens1:
            syn1 = wn.synsets(t1)
            if len(syn1) == 0:
                continue
            syn1 = syn1[0]
            for t2 in textTokens2:
                syn2 = wn.synsets(t2)
                if len(syn2) == 0:
                    continue
                syn2 = syn2[0]
                curScore = -1
                if (syn1, syn2) in synSynToScore:
                    curScore = synSynToScore[(syn1, syn2)]
                elif (syn2, syn1) in synSynToScore:
                    curScore = synSynToScore[(syn2, syn1)]
                else:
                    # calculate it; don't want to store tons -- look-up is cheap
                    curScore = wn.wup_similarity(syn1, syn2)
                    synSynToScore[(syn1, syn2)] = curScore
                if curScore is not None and curScore > bestScore:
                    bestScore = curScore
        feature.addRelational(uid1, uid2, bestScore)
        i += 1
        if i % 1000 == 0:
            print("\tprocessed", i, "of", len(xuidPairs),
                  "(%2.2f)" % float(100.0 * i / len(xuidPairs)), end="\r")
    pickle_out = open(fileOut, 'wb')
    pickle.dump(feature, pickle_out)
    pickle_out.close()
    print("")

def onLine():
    analysis = Analysis('../data/all.csv')
    analysis.dataDistribution()
    feature = Feature()
    feature.newFeature()
    feature.categoryNumerical('../data/train.csv', '../data/test.csv')
    #train = TrainAndPredict('../data/rf/train.csv', '../data/rf/validation.csv', '../data/test.csv')
    train = TrainAndPredict('../data/rf/one_hot_train.csv',
                            '../data/rf/one_hot_validation.csv',
                            '../data/test.csv')
    #train.gbdtClassifier()
    #train.gbdtRegressor()
    #train.linearRegression()
    #train.logisticRegression()
    train.xgbost()

def create_features(self, img_height, img_width, min_feature_width,
                    max_feature_width, min_feature_height, max_feature_height):
    features = []
    print('Creating features ...')
    for feature in FeatureTypes:
        feature_start_width = max(min_feature_width, feature[0])
        for feature_width in range(feature_start_width, max_feature_width, feature[0]):
            feature_start_height = max(min_feature_height, feature[1])
            for feature_height in range(feature_start_height, max_feature_height, feature[1]):
                for x in range(img_width - feature_width):
                    for y in range(img_height - feature_height):
                        features.append(Feature(feature, (x, y), feature_width, feature_height, 0, 1))
                        features.append(Feature(feature, (x, y), feature_width, feature_height, 0, -1))
    print('..done. ' + str(len(features)) + ' features created.')
    return features

def getSample(sentence):
    write_file_1 = open("uni.test.literal", "a")
    write_file_2 = open("bi.test.literal", "a")
    concept_list = reorder_concepts(sentence.concepts)
    params = []
    for concept in concept_list[0]:
        params.append((concept, []))
    for concept in concept_list[1]:
        params.append((concept, []))
    for function in concept_list[2]:
        if function.param_num == 1:
            # iterate over a snapshot: appending to params while iterating it
            # directly would also visit the newly added items
            for param in list(params):
                params.append((function, [param]))
        else:
            for i in range(0, len(params)):
                for j in range(i + 1, len(params)):
                    if not function_filter(function, params[i], params[j]):
                        params.append((function, [params[i], params[j]]))
    samples = []
    for predicate in concept_list[3]:
        if predicate.param_num == 1:
            for param in params:
                samples.append((predicate, [param]))  # lack [] initially
        else:
            for i in range(0, len(params)):
                for j in range(i + 1, len(params)):
                    if not binary_filter(predicate, params[i], params[j]):
                        samples.append((predicate, [params[i], params[j]]))
                    if order_matter(predicate):
                        if not binary_filter(predicate, params[j], params[i]):
                            samples.append((predicate, [params[j], params[i]]))
    for sample in samples:
        print sentence.text
        print sample[0].name + "(",
        if len(sample[1]) == 1:
            print sample[1][0][0].name + "-" + str(sample[1][0][0].token_id) + ")"
            sam = Feature([sample[0], sample[1][0][0]], sentence)
            features = sam.generateFeature()
            write_file_1.write(sentence.text + "\n" + sample[0].name + "-" + str(sample[0].token_id) +
                               "(" + sample[1][0][0].name + "-" + str(sample[1][0][0].token_id) + ")\t" +
                               convert_features(features) + "\n")
        else:
            print sample[1][0][0].name + "-" + str(sample[1][0][0].token_id) + "," + \
                sample[1][1][0].name + "-" + str(sample[1][1][0].token_id) + ")"
            sam = Feature([sample[0], sample[1][0][0], sample[1][1][0]], sentence)
            features = sam.generateFeature()
            write_file_2.write(sentence.text + "\n" + sample[0].name + "-" + str(sample[0].token_id) +
                               "(" + sample[1][0][0].name + "-" + str(sample[1][0][0].token_id) + "," +
                               sample[1][1][0].name + "-" + str(sample[1][1][0].token_id) + ")\t" +
                               convert_features(features) + "\n")
        #print sample, features
        print features
    write_file_1.close()
    write_file_2.close()

def parse_all_docs(self):
    fob = Feature(self.exp)
    cob = Category()
    cob.get_category_done_list()
    stime = time.time()
    print "Parsing documents in " + self.exp
    print "Start time: " + time.ctime(stime)
    with open(self.listing_document_path, 'r') as f:
        lines = [line.strip() for line in f]
    lines = [line for line in lines[2:] if line]
    document_id = ''
    for line in lines:
        elements = line.split(' ')
        if elements[0] == '#':
            document_id = elements[3]
            continue
        c_id, category = self._get_category(elements[0])
        if category == 'NA':
            continue
        if c_id in cob.category_done_list and \
                not self.ignore_duplicate_category:
            continue
        sample = self._get_features(fob, elements[1:])
        sample['Category'] = category
        sample['Id'] = document_id
        self.samples = self.samples.append(sample, ignore_index=True)
        print "Document #" + document_id + " parsed"
    self.samples = self.samples.fillna(0)
    etime = time.time()
    print "Documents in " + self.exp + ' parsed'
    print "End time: " + time.ctime(etime)
    print "Time taken: " + str(etime - stime) + " seconds"
    cob.update_category_done_list([self.category_id1, self.category_id2])
    fob.destroy_list()

def add_alters_to_ego_net(ego_net, alter_features_file, ego_net_features):
    '''
    ego_net: object of the EgoNet class
    alter_features_file: file the user inputs
    ego_net_features: used to access the feature dictionary

    Splits the information in each line of the file, builds the feature name
    and value using the different classes, calls add_feature from the Node
    class to attach each feature to a node, then adds the node to the ego net
    with EgoNet.add_alter_node.

    Returns the ego_net object.
    '''
    for line in alter_features_file:  # go through each line in the file
        pieces = line.split()  # split the line into a list
        node_id = int(pieces[0])  # get the id
        new_node = Node(node_id, len(pieces[1:]))  # build a Node from the id and feature count
        for i, j in enumerate(pieces[1:]):  # index and value of each feature field
            feature_name = ego_net_features[i][1]  # get the feature name
            feature_value = ego_net_features[i][0]  # get the feature value
            feature_object = Feature(feature_name, feature_value, int(j))
            new_node.add_feature(i, feature_object)  # attach the feature to the node
        ego_net.add_alter_node(new_node)  # add the node to the ego net
    return ego_net  # return the ego_net object

def add_alters_to_ego_net(ego_net, alter_features_file, ego_net_features):
    '''
    Iterates through each line in the features file,
    splits each line into a list separated by spaces,
    isolates the alter_id and the alter values in the line_list,
    creates a Node object using the alter_id and the number of features,
    uses the alter's add_feature method to add a feature for each value,
    and adds the node/alter to the ego_net.

    Returns: ego_net
    '''
    # Iterate through each line in the features file
    for line in alter_features_file:
        # Split the line into a list
        a_list = line.split()
        # Isolate values
        alter_id = int(a_list[0])
        line_list = a_list[1:]
        # Create a Node object and assign it to alter
        alter = Node(alter_id, len(line_list))
        # Iterate through each value in the alter values list
        for i, digit in enumerate(line_list):
            # in order to add a feature we must create a Feature instance
            alter.add_feature(
                i, Feature(ego_net_features[i][1], ego_net_features[i][0], int(digit)))
        # Add the alter to the ego_net
        ego_net.add_alter_node(alter)
    return ego_net

def train_opinion_tokens():
    comment_cnt_dict = {}
    token_df_dicts = {}
    processed_file_cnt = 0
    for root, dir, files in os.walk(Config.short_comment_path):
        for file_name in files:
            douban_id = file_name
            comments = DoubanComment.get_comments(douban_id)
            for comment in comments:
                rating = comment['rating']
                if rating == 50 or rating == 10:
                    comment_cnt_dict[rating] = \
                        comment_cnt_dict.get(rating, 0) + 1
                    token_df_dicts[rating] = token_df_dicts.get(rating, {})
                    valid_tokens_set = Feature.get_valid_tokens(
                        comment['comment'])
                    for token in valid_tokens_set:
                        token_df_dicts[rating][token] = \
                            token_df_dicts[rating].get(token, 0) + 1
            processed_file_cnt += 1
            print 'processed %s files' % processed_file_cnt
            # if processed_file_cnt >= 1000:
            #     break
    for rating, comment_cnt in comment_cnt_dict.items():
        token_df_dict = token_df_dicts[rating]
        token_df_list = sorted(token_df_dict.items(), key=lambda x: -x[1])
        output_obj = open(os.path.join(Config.opinion_path, str(rating)), 'w')
        output_obj.write('%s\n' % comment_cnt)
        for token, df in token_df_list:
            output_obj.write('%s\t%s\n' % (token.encode('utf8'), df))
        output_obj.close()

def add_feature(self, feature, **kwargs):
    """add_feature(self, feature, **kwargs)

    o feature   Bio.SeqFeature object

    o **kwargs  Keyword arguments for the Feature; named attributes of the Feature

    Add a Bio.SeqFeature object to the diagram (it will be stored internally
    in a Feature wrapper).
    """
    id = self.next_id  # get id number
    self.features[id] = Feature(self, id, feature)  # add feature
    for key in kwargs:
        if key == "colour" or key == "color":
            # Deal with "colour" as a special case by also mapping to color.
            # If Feature.py used a python property we wouldn't need to call
            # set_color explicitly. However, this is important to make sure
            # every color gets mapped to a colors object - for example color
            # numbers, or strings (may not matter for PDF, but does for PNG).
            self.features[id].set_color(kwargs[key])
            continue
        setattr(self.features[id], key, kwargs[key])
    self.next_id += 1  # increment next id

def prepare(self, scales):  # const vector<Size>& scales
    # Initialize test locations for features
    totalFeatures = self.nstructs * self.structSize
    for i in range(len(scales)):
        tmp = []
        self.features.append(tmp)
    for i in range(totalFeatures):
        x1f = random.random()
        x2f = random.random()
        y1f = random.random()
        y2f = random.random()
        for j in range(len(scales)):
            # scales[j][0] = width, scales[j][1] = height
            x1 = x1f * scales[j][0]
            y1 = y1f * scales[j][1]
            x2 = x2f * scales[j][0]
            y2 = y2f * scales[j][1]
            self.features[j].append(Feature(x1, y1, x2, y2))
    # Thresholds
    self.thrN = 0.5 * self.nstructs
    # Initialize Posteriors
    # positives = pCounter, negatives = nCounter
    for i in range(self.nstructs):
        self.posteriors.append([0] * pow(2, self.structSize))
        self.pCounter.append([0] * pow(2, self.structSize))
        self.nCounter.append([0] * pow(2, self.structSize))

def __scaffold_contigs(self, contig_ids=None):
    seq = str(self.get_original_seq()).upper()
    s_id = self.get_name()
    slen = len(seq)
    i = c_start = 0
    contig_count = 0
    value = 1
    id = None
    last_contig = 0
    while True:
        i = seq.find('N', i)
        if i < 0:
            break
        # count consecutive Ns
        n_start = i
        while i < slen and seq[i] == 'N':
            i += 1
        # this many Ns in a row constitute a contig break (gap)
        n_len = i - n_start
        if n_len >= self.minGapSize:
            c_len = n_start - c_start
            if c_len >= self.minConSize:
                id = s_id + "_c" + str(contig_count + 1)
                if contig_ids:
                    id = contig_ids[contig_count]
                self.contigs.append(Feature(c_start, n_start, value, id))
                contig_count += 1
                last_contig = n_start
            elif contig_count == 0:
                self.seq_start = i
            c_start = i
            #contig_count += 1
    if last_contig < slen:
        if slen - c_start > self.minConSize:
            id = s_id + "_c" + str(contig_count + 1)
            if contig_ids:
                id = contig_ids[contig_count]
            self.contigs.append(Feature(c_start, slen, value, id))
        else:
            self.seq_end = last_contig
    self.get_contig_lengths_list()
    assert self.get_contig_length() + self.get_gap_length() == self.get_length()

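# Hedged standalone sketch (not part of the original class) of the same N-run
# scanning idea as __scaffold_contigs, reduced to a pure function so the
# splitting rule is easy to test; the gap/contig minimum sizes are illustrative.
def split_on_gaps(seq, min_gap=2, min_contig=3):
    """Return (start, end) pairs of contigs separated by runs of >= min_gap Ns."""
    contigs, c_start, i = [], 0, 0
    while True:
        i = seq.find('N', i)
        if i < 0:
            break
        n_start = i
        while i < len(seq) and seq[i] == 'N':
            i += 1
        if i - n_start >= min_gap:  # a long enough N-run ends the current contig
            if n_start - c_start >= min_contig:
                contigs.append((c_start, n_start))
            c_start = i
    if len(seq) - c_start >= min_contig:
        contigs.append((c_start, len(seq)))
    return contigs

# e.g. split_on_gaps("ACGTANNNNNCGTACGNACGT") -> [(0, 5), (10, 21)]
# (the single N at index 16 is shorter than min_gap, so it does not split)
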
def __init__(self, symbole, **traits):
    self.__symbole = symbole
    self.__traits = set(map(lambda x: Feature(x[0], x[1]), traits.items()))
    recup = self.__memory.get(symbole)
    if recup == traits:
        print('This trait-symbol combination already exists.')
    else:
        self.__memory[symbole] = traits

def movie_genome_sim(douban_id, genome_id, movie_tf_idf_path,
                     genome_tf_idf_path):
    movie_tf_idf_dict = dict(Feature.get_tf_idf_from_file(douban_id,
                                                          movie_tf_idf_path))
    genome_tf_idf_dict = dict(Genome.get_tf_idf_from_file(genome_id,
                                                          genome_tf_idf_path))
    return Tagging.cos_sim([movie_tf_idf_dict, genome_tf_idf_dict])

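# Hedged sketch of what Tagging.cos_sim presumably computes over the two
# sparse term->weight dicts (the real implementation may differ): a standard
# cosine similarity, with dict.get supplying zero for missing terms.
from math import sqrt

def cos_sim_dicts(a, b):
    dot = sum(w * b.get(t, 0.0) for t, w in a.items())
    norm_a = sqrt(sum(w * w for w in a.values()))
    norm_b = sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

# cos_sim_dicts({'war': 1.0, 'epic': 2.0}, {'war': 2.0}) == 2.0 / (sqrt(5.0) * 2.0)
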
def tf_2_tfidf(tf_dict):
    idf_dict = Feature.get_idf_dict()
    if len(tf_dict) > 0:
        tf_idf_list = map(lambda x: (x[0], idf_dict[x[0]] * x[1]),
                          tf_dict.items())
        sorted_list = sorted(tf_idf_list, key=lambda x: -x[1])
        return dict(sorted_list[:100])
    else:
        return {}

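# Hedged worked example (stub data, not the real Feature.get_idf_dict output):
# tf_2_tfidf multiplies each term's frequency by its IDF weight and keeps the
# top 100 terms. With the stub below, 'rare' outranks 'common' despite a lower
# raw count because its IDF is much higher.
def _tf_2_tfidf_demo():
    idf_dict = {'common': 0.1, 'rare': 3.0}  # hypothetical IDF table
    tf_dict = {'common': 10, 'rare': 2}      # hypothetical term counts
    tf_idf = sorted(((t, idf_dict[t] * f) for t, f in tf_dict.items()),
                    key=lambda x: -x[1])
    return dict(tf_idf[:100])                # {'rare': 6.0, 'common': 1.0}
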
def _feature(self, i):
    def context(input):
        line = input.words[input.index][2]
        inputId = (input.input.path, line[0])
        if inputId != self.cachedInputId:
            raise Exception('Unexpected call to feature')
        return self.cachedScopeChain[self.cachedDepth - 1 - i]
    return Feature('{}scope'.format(i), context, word)

def __init__(self, csvFileName):
    self.csvFileName = csvFileName
    self.table = []
    df = pd.read_csv(csvFileName)
    t_flag = 1
    index = []
    for i in range(1, len(df) + 1):
        index.append(i)
    for col in df.columns:
        if df[col].dtype == "float64" or df[col].dtype == "int64":
            f = Feature(col, df[col].values)
            self.table.append(f)
            b = f.getSampels() == index
            if b.all():
                t_flag = 0
    if t_flag:
        f = Feature("TimeStamp", index)
        self.table.append(f)

def preprocess(paths, classes):
    f = Feature()
    X = []
    y = []
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for index, path in enumerate(paths):
        print('Preprocessing', index, path)
        ar = f.getFeature(path)
        if ar.all() == 0:
            continue
        X.append(ar)
        y.append(classes[index])
    test_time = time.perf_counter() - start
    print("Preprocessing Total time: {0:.2f}".format(test_time))
    X = np.array(X)
    y = np.array(y)
    return X, y

def LoadDigitData(self, file_path):
    feature = []
    for line in open(file_path):
        line = line.strip()
        line_feature = [ord(ch) - ord('0') for ch in line]
        feature.extend(line_feature)
    self.dim = len(feature)
    return Feature(np.array(feature))

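# Hedged illustration (made-up file name and contents) of the input format
# LoadDigitData above appears to expect: each line is a row of digit
# characters, and all rows are flattened into a single feature vector.
def _load_digit_data_demo(tmp_path="digit_demo.txt"):
    with open(tmp_path, "w") as f:
        f.write("010\n111\n010\n")  # a tiny 3x3 plus-sign "digit image"
    feature = []
    with open(tmp_path) as f:
        for line in f:
            feature.extend(ord(ch) - ord('0') for ch in line.strip())
    return feature  # [0, 1, 0, 1, 1, 1, 0, 1, 0]; dim would be 9
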
def create_feature(feature_description):
    feature_name = feature_description[0]
    if "{" in feature_description[1] and "}" in feature_description[1]:
        feature_type = "CATEGORICAL"
    else:
        feature_type = "NUMERIC"
    feature_possible_values = (
        feature_description[1].replace("{", "").replace("}", "")).split(",")
    feature = Feature(feature_name, feature_type, feature_possible_values)
    # append feature to features_list
    features_list.append(feature)

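# Hedged usage sketch for create_feature above: the description looks like an
# ARFF-style attribute split into [name, domain]. The namedtuple below is a
# stand-in for the real Feature class, whose constructor signature is inferred
# from the call above; features_list is the module-level list it appends to.
from collections import namedtuple

Feature = namedtuple("Feature", ["name", "type", "possible_values"])
features_list = []

# create_feature(["outlook", "{sunny,overcast,rainy}"]) would append
#   Feature(name='outlook', type='CATEGORICAL',
#           possible_values=['sunny', 'overcast', 'rainy'])
# create_feature(["temperature", "real"]) would append
#   Feature(name='temperature', type='NUMERIC', possible_values=['real'])
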
def getVisualVector(imset):
    try:
        from Feature import Feature, FeatureType
        feature = Feature.factory(type=FeatureType.GIST, im_set=[imset])
        feature.process()
        return feature.results
    except Exception as e:
        print(e)
        return None

def decorate_wall(obj, options):
    if options.options.windows == "window_line":
        spaced_points = vg.extrude(
            obj.bottom(), Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        for vec in spaced_points:
            obj.features.append(Feature("window", vec, options=options.options))
    elif options.options.windows == "window_line_double":
        spaced_points = vg.extrude(
            obj.bottom(), Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        spaced_points2 = vg.extrude(spaced_points, Map(spacing=V3(0, 1, 0)))
        for vec in spaced_points + spaced_points2:
            obj.features.append(Feature("window", vec, options=options.options))
    elif options.options.windows == "window_slits":
        spaced_points = vg.points_spaced(obj.bottom(), Map(every=5))
        spaced_points = vg.extrude(
            spaced_points, Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        spaced_points2 = vg.extrude(spaced_points, Map(spacing=V3(0, 1, 0)))
        for vec in spaced_points + spaced_points2:
            obj.features.append(Feature("spacing", vec))
    else:
        spaced_points = vg.points_spaced(obj.bottom(), Map(every=3))
        spaced_points = vg.extrude(
            spaced_points, Map(spacing=V3(0, math.ceil(obj.height / 2), 0)))
        for vec in spaced_points:
            obj.features.append(Feature("window", vec, options=options.options))
    mid_points = vg.middle_of_line(obj.bottom(),
                                   Map(center=True, max_width=2, point_per=10))
    for vec in mid_points:
        obj.features.append(
            Feature("door", vec,
                    Map(cardinality=obj.cardinality,
                        door_inside=options.options.door_inside)))
    return obj

def _get_feature(self, text_obj):
    """Get feature data for a whole text object."""
    try:
        for relation in text_obj.relations:
            if relation.is_event_event():
                f = Feature(relation, strings_cache_g, nlp_persistence_obj_g,
                            duration_cache_g, discourse_cache_g,
                            features_event_event_g)
                feature = f.get_feature()
                relation.set_feature(feature)
            elif relation.is_event_timex():
                f = Feature(relation, strings_cache_g, nlp_persistence_obj_g,
                            duration_cache_g, discourse_cache_g,
                            features_event_timex_g)
                feature = f.get_feature()
                relation.set_feature(feature)
            # Append feature to relation in text_obj.relations_plain if present
            if relation.is_event_event() or relation.is_event_timex():
                if relation in text_obj.relations_plain:
                    # Search for the relation
                    for rel in text_obj.relations_plain:
                        if rel == relation:
                            rel.set_feature(feature)
                            break
        # Print progress
        with _counter_lock:
            _counter.value += 1
            sys.stdout.write("\r%d%%" % int(_counter.value * 100 / (_length - 1)))
            sys.stdout.flush()
        return text_obj
    except Exception as e:
        # Print progress
        with _counter_lock:
            _counter.value += 1
            sys.stdout.write("\r%d%%" % int(_counter.value * 100 / (_length - 1)))
            sys.stdout.flush()
        print e
        print traceback.format_exc()

def cal_tf(genome_id):
    genome_movie_dict = Genome.load_genome_movie()
    movie_set = genome_movie_dict.get(genome_id, set())
    #print '%s have %s movies' % (genome_id, len(movie_set))
    genome_tf_dict = {}
    for douban_id in movie_set:
        movie_tf_dict = Feature.get_tf_from_file(douban_id)
        #print '%s have %s terms' % (douban_id, len(movie_tf_dict))
        for term, freq in movie_tf_dict.items():
            genome_tf_dict[term] = genome_tf_dict.get(term, 0) + freq
    return genome_tf_dict

def __init__(self, buffer, language):
    """
    Constructor.

    buffer   -- the associated TextBuffer
    language -- the spell checking language
    """
    Feature.__init__(self, buffer)
    if type(language) != type([]):
        language = [language]
    self.dicts = []
    self.changed_lines = []
    self.tag = buffer.create_tag('incorrect',
                                 underline=pango.UNDERLINE_SINGLE,
                                 foreground='red')
    buffer.connect('insert-text', self._on_buffer_insert_text)
    buffer.connect('delete-range', self._on_buffer_delete_range_after)
    buffer.connect('changed', self._on_buffer_changed)
    for lang in language:
        self.dicts.append(enchant.Dict(lang))

def add_ego_net_features_to_ego(ego, ego_feature_file, ego_net_features):
    '''Reads a one-line file of features for the ego node'''
    line_list = ego_feature_file.readline().split()  # read one line
    # i is the index, digit is the value
    for i, digit in enumerate(line_list):
        # in order to add a feature we must create a Feature instance
        ego.add_feature(
            i, Feature(ego_net_features[i][1], ego_net_features[i][0], int(digit)))
    return ego

def get_sentences(self, seg_result):
    sentences = []
    start = 0
    end = 0
    words = seg_result['ret']
    while end < len(words):
        sentence, end = self.get_first_sentence(words, start)
        if len(Feature.get_verb_noun(sentence)) > 0:
            sentences.append(sentence)
        start = end
    return sentences

def main():
    train_email_data = EmailData()
    train_email_data.load_from_file('data/train')
    feature = Feature()
    feature.learn(train_email_data)
    train_data_set = feature.translate_email_data(train_email_data)
    #print(feature.features)
    naive_bayesian = NaiveBayesian()
    naive_bayesian.learn(feature, train_data_set)
    test_email_data = EmailData()
    test_email_data.load_from_file('data/test')
    test_data_set = feature.translate_email_data(test_email_data)
    print('# Training set')
    test(naive_bayesian, train_data_set)
    print('# Testing set')
    test(naive_bayesian, test_data_set)

def __init__(self, buffer):
    """
    Constructor.

    buffer -- the associated TextBuffer
    """
    Feature.__init__(self, buffer)
    self.bullet_point = u'•'
    self.lock_signals = None
    self.start_tag = buffer.create_tag('list-start',
                                       #foreground='lightblue',
                                       left_margin=30,
                                       pixels_above_lines=12)
    self.bullet_tag = buffer.create_tag('list-bullet',
                                        #background='orange',
                                        left_margin=30)
    self.list_tag = buffer.create_tag('list',
                                      #underline=pango.UNDERLINE_SINGLE,
                                      left_margin=30,
                                      pixels_above_lines=3)
    buffer.connect_after('insert-text', self._on_buffer_insert_text_after)
    buffer.connect('delete-range', self._on_buffer_delete_range)
    buffer.connect('mark-set', self._on_buffer_mark_set)

            phage_as_gta.append(testNames[r])
        else:
            # gta as virus
            gta_as_phage.append(testNames[r])
    # if not MINI:
    #     print("\nPhages (%d) misclassified over %d reps: %s" % (len(phage_as_gta), nrep, phage_as_gta))
    #     print("\nGTA (%d) misclassified over %d reps: %s\n" % (len(gta_as_phage), nrep, gta_as_phage))
    return (score0 / nrep, score1 / nrep)


if __name__ == '__main__':
    # Load profiles
    gta_profs = Loader.load(GTA_PATH, "GTA")
    viral_profs = Loader.load(VIRAL_PATH, "virus")
    # Make features
    feats = Feature(gta_profs.profiles + viral_profs.profiles)
    # kmer
    feats.make_kmer_dict(K)
    feats.kmer_feat()
    # pseaac
    feats.pseaac(lam=LAM, weight=PSE_WEIGHT)
    # physicochem
    feats.physicochem()
    # Xval
    # predictor = KNeighborsClassifier(n_neighbors=10)
    predictor = MultinomialNB()
    result = xval(predictor, gta_profs, viral_profs, NFOLDS, NREPS)
    if MINI:
        print("GTA Correct\tViral Correct")
        print("%.2f\t%.2f" % (result[0], result[1]))

def setPublishDate(self, d):
    self._publishDate = d if Feature._vDate(d) else None

def setChangeId(self, changeId):

def metagene_count():
    """Chain of command for metagene_count analysis."""
    arguments = get_arguments()
    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())
    # define has_abundance and has_mappings tags for Read class
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, "NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, "NH:i:(\d+)")
    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print "Metagene definition:\t{}".format(metagene)
    except MetageneError as err:
        print err
        raise MetageneError("Unable to create the metagene template")
    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print "Reading feature file as {} format".format(Feature.format)
    except MetageneError as err:
        print err
        raise MetageneError("Unable to create the feature object")
    # print out the header line...
    if not arguments.interval_variable:
        with open("{}.metagene_counts.csv".format(arguments.output_prefix), 'w') as output_file:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())
    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            if feature_line[0] != "#":  # skip comment lines
                # change creation with feature_method
                feature = Feature.create(arguments.feature_count, metagene, feature_line,
                                         arguments.count_splicing, arguments.ignore_strand)
                # pull out sam file lines; it is important to use
                # Feature.get_samtools_region(chromosome_lengths) rather than
                # Feature.get_chromosome_region() because only the first ensures that the
                # interval does not extend beyond the length of the chromosome, which
                # makes samtools view return no reads
                (run_pipe_worked, sam_sample) = run_pipe(['samtools view {} {}'.format(
                    arguments.alignment, feature.get_samtools_region())])
                if run_pipe_worked:
                    for samline in sam_sample:
                        if len(samline) > 0:
                            # create Read feature
                            (created_read, read) = Read.create_from_sam(
                                samline,
                                Feature.chromosome_conversion.values(),
                                arguments.count_method,
                                arguments.uniquely_mapping,
                                arguments.ignore_strand,
                                arguments.count_secondary_alignments,
                                arguments.count_failed_quality_control,
                                arguments.count_PCR_optical_duplicate,
                                arguments.count_supplementary_alignment)
                            # count read (if it exists)
                            if created_read:
                                feature.count_read(read, arguments.count_method,
                                                   arguments.count_splicing,
                                                   arguments.count_partial_reads,
                                                   arguments.ignore_strand)
                    # output the resulting metagene
                    with open("{}.metagene_counts.csv".format(arguments.output_prefix), 'a') as output_file:
                        output_file.write(
                            "{}\n".format(feature.print_metagene(interval_override=arguments.interval_variable)))
                else:
                    raise MetageneError("Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                        feature.get_chromosome_region(), feature.name, arguments.alignment))

if __name__ == '__main__':
    start = time.time()
    # Get args
    parser = get_args()
    args = parser.parse_args()
    # Print detail
    mini = args.mini

    ### Load training set and make features ###
    gta_file = args.gta[0]
    virus_file = args.virus[0]
    # Load profiles
    gta_profs = Loader.load(gta_file, "GTA")
    viral_profs = Loader.load(virus_file, "virus")
    # Make features
    feats = Feature(gta_profs.profiles + viral_profs.profiles)
    # the original tested "args.kmer == None" here (and for pseaac below),
    # which inverts the intended check and would pass None into the helpers
    if args.kmer is not None:
        kmer_size = args.kmer
        feats.make_kmer_dict(kmer_size)
        feats.kmer_feat()
    if args.pseaac is not None:
        feats.pseaac(lam=int(args.pseaac), weight=PSE_WEIGHT)
    if args.physico:
        feats.physicochem()
    if args.kmer is None and args.pseaac is None and not args.physico:
        print("You must specify at least one feature type (-k, -p, -y).")
    else:
        # Weight if needed
        if args.weight:

def evaluate(ans, res):
    total = 0.0
    for i in range(len(ans)):
        total += math.fabs(ans[i] - res[i])
    return total


if __name__ == '__main__':
    # Parses arguments
    if len(sys.argv) != 3 or (sys.argv[1] not in ['pca', 'ae']):
        print("Usage:", sys.argv[0], "[dim reduce method (pca/ae)] [dimension]")
        sys.exit(1)
    DIM = int(sys.argv[2])
    feature = Feature()
    train_X, train_Y = feature.getYearFeatures(2015)
    test_X, test_Y = feature.getYearFeatures(2010)
    print("All data prepared.")
    train_X_reduce = None
    test_X_reduce = None
    if sys.argv[1] == 'pca':
        pca = PCA(n_components=DIM)
        train_X_reduce = np.concatenate((
            pca.fit_transform(np.array([x[0] + x[1] for x in train_X])),
            np.array([x[2] for x in train_X])
        ), axis=1)
        # Applies same model on test data.
        test_X_reduce = np.concatenate((

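# Hedged note on evaluate() above: it is the L1 distance (sum of absolute
# errors) between two equal-length vectors; the baseline script below divides
# the same quantity by len(real_ans) to report an average error.
import math

def _evaluate_demo():
    ans, res = [1.0, 2.0, 3.0], [1.0, 1.0, 5.0]
    return sum(math.fabs(a - r) for a, r in zip(ans, res))  # 0 + 1 + 2 = 3.0
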
def setup():
    work = get_work()
    config = Config()
    snappy = Snappy()
    feature = Feature()
    globalLength = config.getElementLength()
    for fan in config.getFans():
        snappy.addFan(fan)
    for blank in config.getBlanks():
        snappy.addBlank(blank)
    for baffle in config.getBaffles():
        snappy.addBaffle(baffle)
    for solid in config.getSolids():
        snappy.addSolid(solid)
    for refinementRegion in config.getRefinementRegions():
        snappy.addRefinementRegion(refinementRegion)
    for geom in config.getSolids() + config.getFans() + config.getBlanks() + config.getBaffles():
        localLength = config.getElementLength(geom)
        refinementLevel = calcRefinementLevel(globalLength, localLength)
        snappy.setRefinement(refinementLevel, geom)
        feature.addGeom(geom)
    for geom in config.getRefinementRegions():
        localLength = config.getElementLength(geom)
        refinementLevel = calcRefinementLevel(globalLength, localLength)
        snappy.setRegionRefinement(refinementLevel, geom)
    boundingBox = config.getBoundingBox()
    dist = config.getBoundingBoxDistance()
    fluidBoundaries = ([x - dist for x in boundingBox[0:3]] +
                       [x + dist for x in boundingBox[3:6]])
    location = [x - .00111 for x in fluidBoundaries[3:6]]
    snappy.setLocation(location)
    bmName = os.path.join(work.polyMeshDir(), "blockMeshDict")
    # template = TemplateFile(bmName + ".template")
    # template.writeToFile(bmName, {'minx': fluidBoundaries[0],
    #                               'miny': fluidBoundaries[1],
    #                               'minz': fluidBoundaries[2],
    #                               'maxx': fluidBoundaries[3],
    #                               'maxy': fluidBoundaries[4],
    #                               'maxz': fluidBoundaries[5],
    #                               'size': globalLength})
    minx = fluidBoundaries[0]
    miny = fluidBoundaries[1]
    minz = fluidBoundaries[2]
    maxx = fluidBoundaries[3]
    maxy = fluidBoundaries[4]
    maxz = fluidBoundaries[5]
    blockMesh = ParsedParameterFile(bmName)
    blockMesh["vertices"] = [
        "(%.6f %.6f %.6f)" % (minx, miny, minz),
        "(%.6f %.6f %.6f)" % (maxx, miny, minz),
        "(%.6f %.6f %.6f)" % (maxx, maxy, minz),
        "(%.6f %.6f %.6f)" % (minx, maxy, minz),
        "(%.6f %.6f %.6f)" % (minx, miny, maxz),
        "(%.6f %.6f %.6f)" % (maxx, miny, maxz),
        "(%.6f %.6f %.6f)" % (maxx, maxy, maxz),
        "(%.6f %.6f %.6f)" % (minx, maxy, maxz)]
    numx = int((maxx - minx) / globalLength)
    numy = int((maxy - miny) / globalLength)
    numz = int((maxz - minz) / globalLength)
    blockMesh["blocks"][2] = "(%d %d %d)" % (numx, numy, numz)
    blockMesh.writeFile()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Baseline method for MD final project.')
    parser.add_argument('--year', '-y', help='Year to test',
                        dest='year', type=int, required=True)
    parser.add_argument('--node-threshold', '-n', help='Node features # threshold',
                        dest='node_threshold', type=int, default=150)
    parser.add_argument('--migration-threshold', '-m', help='Migrants non-zero # threshold',
                        dest='migrat_threshold', type=int, default=40)
    args = parser.parse_args(sys.argv[1:])
    feature = Feature(args.node_threshold, args.migrat_threshold)
    ans = baseline(feature, args.year)
    # Getting real answer.
    real = feature.getValidation(args.year)
    real_ans = []
    for tar, iy in feature.country_index.items():
        for src, ix in feature.country_index.items():
            real_ans.append(real[(src, tar)])
    N_country = len(feature.country_index)
    error = evaluate(real_ans, ans)
    print("Baseline error:", error)
    print("Average error:", error / len(real_ans))

def __init__(self, *args, **kwargs):
    Feature.__init__(self)
    self.persons = kwargs['persons']

import numpy as np
import sys
#sys.path.append('data/validation')
from Feature import Feature
from pca import DR_PCA
from math import sqrt, fabs
from cvxopt import matrix, spmatrix, solvers
import AE

# note: this script relies on Python 2 input(), which eval()s what is typed,
# so dim, migration_threshold and op come back as ints
dim = input('dim = ')
migration_threshold = input('migration_threshold = ')
op = input('type 0 to use PCA, 1 to use AE : ')
f = Feature(node_threshold=180, migration_threshold=migration_threshold)
X, Y = f.getYearFeatures(2015)
X0 = []
X1 = []
for i in range(len(X)):
    X0.append(X[i][0])
    X1.append(X[i][1])
if op == 0:
    x0 = DR_PCA(X0, dim)
    x1 = DR_PCA(X1, dim)
else:
    X0 = np.array(X0)
    X1 = np.array(X1)
    x, w, b = AE.dim_reduce(X0, dim, 2, 20, 0.01)
    x0 = AE.forward2hidden(X0, w, b, 2)
    x, w, b = AE.dim_reduce(X1, dim, 2, 20, 0.01)
    x1 = AE.forward2hidden(X1, w, b, 2)
x = []
for i in range(len(X)):

def __init__(self, *args, **kwargs):
    Feature.__init__(self)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    # stemmed stop words (function words, numbers, month and weekday names, ...)
    self.stopList = frozenset([
        'a', 'abaft', 'aboard', 'about', 'abov', 'absent', 'accord', 'account', 'across', 'addit',
        'afor', 'after', 'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi', 'also',
        'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'anenst', 'ani', 'anoth',
        'anybodi', 'anyhow', 'anyon', 'anyth', 'anywher', 'apart', 'apr', 'april', 'apropo', 'apud',
        'are', 'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop', 'aug', 'august',
        'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor', 'begin', 'behalf', 'behest',
        'behind', 'below', 'beneath', 'besid', 'best', 'better', 'between', 'beyond', 'big', 'bigger',
        'biggest', 'billion', 'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call',
        'can', 'cannot', 'cant', 'case', 'circa', 'close', 'concern', 'could', 'couldt', 'current',
        'daili', 'day', 'dec', 'decemb', 'despit', 'did', 'do', 'doe', 'doesnt', 'done',
        'dont', 'down', 'due', 'dure', 'each', 'eight', 'eighteen', 'eighth', 'eighti', 'eleven',
        'end', 'enough', 'ever', 'except', 'exclud', 'fail', 'far', 'feb', 'februari', 'few',
        'fifth', 'first', 'five', 'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four', 'fourteen',
        'fourth', 'fourti', 'fri', 'friday', 'from', 'front', 'full', 'further', 'get', 'given',
        'go', 'gone', 'goot', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'he',
        'her', 'here', 'herself', 'high', 'higher', 'hightst', 'himself', 'his', 'how', 'hunderd',
        'i', 'if', 'in', 'includ', 'insid', 'instead', 'into', 'is', 'it', 'itself',
        'jan', 'januari', 'jul', 'juli', 'jun', 'june', 'just', 'last', 'late', 'later',
        'latest', 'left', 'lest', 'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest',
        'made', 'make', 'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid', 'midst',
        'might', 'milliard', 'million', 'mine', 'minus', 'mld', 'mln', 'modulo', 'mon', 'monday',
        'month', 'more', 'most', 'mth', 'much', 'must', 'my', 'myself', 'near', 'need',
        'neednt', 'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti', 'no', 'none',
        'nor', 'not', 'notwithstand', 'nov', 'novemb', 'number', 'o', 'oct', 'octob', 'of',
        'off', 'on', 'one', 'onli', 'onto', 'oppos', 'opposit', 'or', 'order', 'other',
        'ought', 'our', 'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per',
        'place', 'plus', 'point', 'previous', 'prior', 'pro', 'pursuant', 'put', 'qua', 'rather',
        'recent', 'regard', 'regardless', 'respect', 'right', 'round', 'said', 'sake', 'same', 'san',
        'sat', 'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen', 'sep', 'septemb',
        'seven', 'seventeen', 'seventh', 'seventi', 'sever', 'shall', 'she', 'should', 'shouldnt', 'show',
        'shown', 'sinc', 'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest', 'so',
        'some', 'somebodi', 'somehow', 'someon', 'someth', 'somewher', 'soon', 'sooner', 'spite', 'start',
        'still', 'subsequ', 'such', 'sun', 'sunday', 'take', 'taken', 'tell', 'ten', 'tenth',
        'than', 'thank', 'that', 'the', 'their', 'them', 'themselv', 'there', 'these', 'they',
        'third', 'thirteen', 'thirti', 'this', 'those', 'thousand', 'three', 'through', 'throughout', 'thru',
        'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today', 'told', 'too', 'took',
        'top', 'toward', 'tue', 'tuesday', 'twelv', 'twenti', 'two', 'under', 'underneath', 'unit',
        'unlik', 'until', 'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice',
        'view', 'virtu', 'vis', 'visavi', 'vs', 'was', 'we', 'wed', 'wednesday', 'week',
        'well', 'went', 'were', 'what', 'when', 'where', 'whether', 'whi', 'which', 'while',
        'who', 'whose', 'will', 'with', 'within', 'without', 'wont', 'wors', 'worst', 'worth',
        'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet', 'you', 'your', 'yourself',
        'yourselv', 'yr'])

def setEmail(self, email):
    self._email = email if Feature._vEmail(email) else None

def __init__(self, *args, **kwargs):
    Feature.__init__(self)

def setup():
    """Create fixtures"""
    # Define chromosome sizes
    Read.extract_chromosome_sizes(["@HD\tVN:1.0\tSO:unsorted",
                                   "@SQ\tSN:chr1\tLN:300",
                                   "@SQ\tSN:chr2\tLN:200",
                                   "@PG\tID:test\tVN:0.1"])
    Feature.process_set_chromosome_conversion(["1\tchr1", "2\tchr2"])
    good_input["bed input counting all of the read"] = (
        "all",
        "[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]")
    good_input["bed input counting start of the read"] = ("start", "[17, 18, 19, 20, 21, 22, 23]")
    good_input["bed input counting end of the read"] = ("end", "[36, 37, 38, 39, 40, 41, 42]")
    good_input["gff input counting all of the read"] = (
        "all",
        "[43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8]")
    good_input["gff input counting start of the read"] = ("start", "[43, 42, 41, 40, 39, 38, 37]")
    good_input["gff input counting end of the read"] = ("end", "[14, 13, 12, 11, 10, 9, 8]")


for method in ['all', 'start', 'end']:
    print "\nTesting feature_count option: ****{}****".format(method)
    if method == 'all':
        metagene = Metagene(10, 4, 2)
        print "\t with Metagene:\t{}".format(metagene)
        print "\t with chromosome conversions:\t{}".format(Feature.chromosome_conversion)
    else:
        metagene = Metagene(1, 4, 2)
        print "\t with Metagene:\t{}".format(metagene)
        print "\t with chromosome conversions:\t{}".format(Feature.chromosome_conversion)

    # create feature from BED line
    try:
        bedline = "{}\t{}\t{}\t{}\t{}\t{}\n".format(1, 20, 40, "first", 44, "+")
        print "\t with BED line:\t{}".format(bedline.strip())
        feature1 = Feature.create_from_bed(method, metagene, bedline, False, False)
        if str(feature1.position_array) != correct_features['bed'][method]:
            print "**FAILED**\t Create Feature from BED line ?"
            print "\t Desired positions:\t{}".format(correct_features['bed'][method])
            print "\t Created positions:\t{}".format(feature1.position_array)
    except MetageneError as err:
        print "**FAILED**\t Create Feature from BED line ?"
    else:
        print "PASSED\t Create Feature from BED line ?\t\t{}".format(feature1.get_chromosome_region())

    # create feature from GFF line
    try:
        gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(2, "test", "gene", 10, 39, ".", "-", ".", "second")
        print "\t with GFF line:\t{}".format(gffline.strip())
        feature2 = Feature.create_from_gff(method, metagene, gffline, False, False)
        if str(feature2.position_array) != correct_features['gff'][method]:
            print "**FAILED**\t Create Feature from GFF line ?\t**FAIL**"
            print "\t Desired positions:\t{}".format(correct_features['gff'][method])
            print "\t Created positions:\t{}".format(feature2.position_array)
    except MetageneError as err:
        print "**FAILED**\t Create Feature from GFF line ?"
    else:
        print "PASSED\t Create Feature from GFF line ?\t\t{}".format(feature2.get_chromosome_region())

    # create feature from GFF line with start and end swapped
    try:
        gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(2, "test", "gene", 39, 10, ".", "-", ".", "second")
        print "\t with GFF line:\t{}".format(gffline.strip())
        feature2 = Feature.create_from_gff(method, metagene, gffline, False, False)
        if str(feature2.position_array) != correct_features['gff'][method]:
            print "**FAILED**\t Create Feature from GFF line with swapped start and end ?\t**FAIL**"
            print "\t Desired positions:\t{}".format(correct_features['gff'][method])
            print "\t Created positions:\t{}".format(feature2.position_array)
    except MetageneError as err:
        print "**FAILED**\t Create Feature from GFF line with swapped start and end ?"
    else:
        print "PASSED\t Create Feature from GFF line with swapped start and end ?\t\t{}".format(
            feature2.get_chromosome_region())

    try:
        gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(2, "test", "gene", 39, 10, ".", "+", ".", "second")
        print "\t with GFF line:\t{}".format(gffline.strip())
        feature2 = Feature.create_from_gff(method, metagene, gffline, False, False)
        if str(feature2.position_array) != correct_features['gff'][method]:
            print "**FAILED**\t Do not create Feature from GFF line with swapped start and end, + strand ?\t**FAIL**"
            print "\t Desired positions:\t{}".format(correct_features['gff'][method])
            print "\t Created positions:\t{}".format(feature2.position_array)
    except MetageneError as err:
        print "PASSED\t Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(err)
    else:
        print "**FAILED**\t Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(
            feature2.get_chromosome_region())

##TODO finish complete testing of Feature class
print "\n##TODO finish testing of Feature class creation\n"

print "\n**** Testing counting and manipulation ****\n"

expected = {'all': {}, 'start': {}, 'end': {}}
# Positions in metagene: 17 18 19 20 21-22,23-24,25-26,27-28,29-30,31-32,33-34,35-36,37-38,39-40, 41, 42
expected['all'] = {
    'all': "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000,0.000,0.286,0.571,0.571,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.100,0.100,0.100,0.100,0.100,0.000,0.000,0.000,0.000,0.000,0.111",
    'start': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000",
    'end': "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000"}
# Positions in metagene: 17 18 19 20 [21] 22 23
expected['start'] = {
    'all': "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.050",
    'start': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
    'end': "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.500"}
# Positions in metagene: 36 37 38 39 [40] 41 42
expected['end'] = {
    'all': "first,sense:allreads,0.000,0.000,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.111",
    'start': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
    'end': "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,1.000"}

metagene = {'all': Metagene(10, 4, 2),
            'start': Metagene(1, 4, 2),
            'end': Metagene(1, 4, 2)}

for method in ['all', 'start', 'end']:
    if method == 'all':
        print "\t with Metagene:\t{}".format(metagene[method])
        print "\t with chromosome conversions:\t{}".format(Feature.chromosome_conversion)
    else:
        print "\t with Metagene:\t{}".format(metagene[method])
        print "\t with chromosome conversions:\t{}".format(Feature.chromosome_conversion)
    print "\nTesting feature_count option: ****{}****".format(method)
    feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(1, 20, 40, "first", 44, "+")
    feature1 = Feature.create_from_bed(method, metagene[method], feature_line, False, False)
    print "\tFeature:\t{}".format(feature1.position_array)

    reads = []
    reads.append(Read("chr1", "+", 3, 1, [10, 11, 12, 13, 14, 15, 16, 17, 18]))
    reads.append(Read("chr1", "-", 1, 2, [23, 24, 25, 26, 27, 28, 29, 30, 31, 32]))
    reads.append(Read("chr1", "+", 4, 2, [30, 31, 32, 33, 34, 40, 41]))
    reads.append(Read("chr1", "-", 1, 1, [42, 43, 44, 45, 46, 47, 48, 49, 50]))
    reads.append(Read("chr1", "+", 10, 1, [51, 52, 53, 54, 55]))
    reads.append(Read("chr2", "+", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25]))

    # starting count
    for count_method in ['all', 'start', 'end']:
        print "\nTesting count_method option: ****{}****".format(count_method)
        output = "{}\n".format(feature1)
        for r in reads:
            output += "{}\n".format(r)
            feature1.count_read(r, count_method, count_partial_reads=True)
        output += "{}\n".format(feature1)
        output += feature1.print_metagene(pretty=True)
        if str(feature1.print_metagene()).strip() == str(expected[method][count_method]).strip():
            print "PASSED\tCreated correct metagene with feature method {} and count method {} ?".format(method, count_method)
        else:
            print "**FAILED**\tCreated correct metagene with feature method {} and count method {} ?".format(method, count_method)
            print "\tExpected:\n{}".format(expected[method][count_method])
            print "\tActual :\n{}".format(feature1.print_metagene())
            print "\tSummary of run:\n{}".format(output)
        # zero out counter for next round
        feature1 = Feature.create_from_bed(method, metagene[method], feature_line, False, False)

    try:
        unstranded_read = Read("chr1", ".", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25])
        feature1.count_read(unstranded_read, 'all')
    except MetageneError as err:
        # the original format string lacked the {} placeholder, so err was never shown
        print "PASSED\tCaught unstranded read on stranded count ?\t\t{}".format(err)
    else:
        print "**FAILED**\tCaught unstranded read on stranded count ?"

    try:
        feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(1, 20, 40, "first", 44, ".")
        feature1 = Feature.create_from_bed(method, metagene[method], feature_line, False, False)
        unstranded_read = Read("chr1", ".", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25])
        feature1.count_read(unstranded_read, 'all')
    except MetageneError as err:
        print "**FAILED**\tAllowed unstranded read on unstranded count ?\t\t{}".format(err)
    else:
        print "PASSED\tAllowed unstranded read on unstranded count ?"

print "\n**** Testing adjust_to_metagene ****\n"

chromosome_converter = {"1": "chr1", "2": "chr2"}
# ((metagene_tuple), (feature_tuple), expected_result_string, message_string)
tests = [((8, 2, 2), (16, 8, 24, 4),
          '8.000,8.000,4.000,4.000,12.000,12.000,2.000,2.000',
          "Expand to metagene ?"),
         ((4, 2, 2), (6, 8, 6, 2, 4, 4, 2, 4, 24, 8),
          '17.000,9.000,8.000,34.000',
          "Contract to metagene ?"),
         ((4, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4),
          '5.500,9.333,17.825,9.475',
          "Contract with messy floats ?"),
         ((3, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4),
          '7.611,19.556,14.967',
          "Contract with other messy floats ?")]

for t in tests:
    metagene = Metagene(*t[0])
    print "\t{}".format(metagene)
    feature_line = "{}\t{}\t{}\n".format(1, 0, len(t[1]))
    feature = Feature.create_from_bed('all', metagene, feature_line, False, False, short=True)
    adjusted_feature = ""
    for f in feature.adjust_to_metagene(t[1]):
        adjusted_feature += "{0:0.3f},".format(f)
    if adjusted_feature[:-1] == t[2]:
        print "PASSED\t{}".format(t[3])
    else:
        print "**FAILED**\t{}".format(t[3])
        print "\tExpected:\t{}".format(t[2])
        print "\tActual :\t{}".format(adjusted_feature[:-1])
        print "\tOriginal:\t{}".format(feature.adjust_to_metagene(t[1]))

print "\n**** End of Testing the Feature class ****\n"
# end of Feature.test method