def main():
    """Split the sequences of a FASTA file into TE-like and retained sets.

    Reads the hit table ``args.infile`` and writes two FASTA files named after
    the part of ``args.fasta`` before its first dot:

    * ``<prefix>.TE.fa``  - one record per hit line whose HMM name (version
      suffix stripped) is listed in ``args.TE`` and whose hit span covers at
      least 30% of the sequence length.
    * ``<prefix>.CON.fa`` - every sequence that never produced such a hit,
      annotated with its pfam entry, or ``Non-domains`` when it has none.

    All three files are handled with context managers so they are flushed and
    closed even when a malformed line raises.
    """
    args = args_parse()
    prefix = str(args.fasta).split(".")[0]
    with open("%s.TE.fa" % prefix, "w") as out1, \
            open("%s.CON.fa" % prefix, "w") as out2:
        seqslen = get_len(args.fasta)
        seqs = get_seqs(args.fasta)
        TE = get_TE_ids(args.TE)
        pfam_ids = get_pfam(args.infile)
        TE_ids = []
        with open(args.infile, "r") as hits:
            for line in hits:
                fields = line.strip().split()
                # Only uncommented lines with exactly 15 columns are hit rows.
                if "#" in line or len(fields) != 15:
                    continue
                ids = fields[0]
                start = int(fields[1])
                end = int(fields[2])
                hmm = fields[5]
                if hmm.split(".")[0] not in TE:
                    continue
                # Fraction of the sequence covered by this hit (inclusive span).
                per = (float(end) - float(start) + float(1)) / float(seqslen.get(ids))
                if per >= float(0.3):
                    TE_ids.append(ids)
                    out1.write(">%s\t%f\n%s\n" % (ids, per, seqs.get(ids)))
        for retain_ids in set(seqs.keys()) - set(TE_ids):
            try:
                annotation = pfam_ids[retain_ids]
            except KeyError:
                annotation = "Non-domains"
            out2.write(">%s\t%s\n%s\n" % (retain_ids, annotation, seqs.get(retain_ids)))
def route(self, minz):
    """Route this net terminal by terminal, storing one path per connection.

    For every terminal after the first, a path is walked from the best end
    node back towards the set of already visited nodes and appended to
    ``self.paths``.

    minz: when truthy, prefer a candidate next node whose third component
        (``node[2]``) equals that of the current path tip.

    Returns True when every terminal was connected. Returns False when the
    candidate-node iterator is exhausted (StopIteration); in that case the
    distance marks are cleared and ``self.remove()`` is called.
    """
    try:
        self.paths = []
        self.sub_terminal_collision_lines()
        visited = set()
        for index in xrange(1, len(self.terminals)):
            # Add the previous terminal's position (each coordinate as
            # int(v + 0.5)) for every z in range(self.pcb.depth).
            visited |= set([(int(self.terminals[index - 1][2][0]+0.5), int(self.terminals[index - 1][2][1]+0.5), z) for z in xrange(self.pcb.depth)])
            # Candidate end nodes: the current terminal's position on every z.
            ends = [(int(self.terminals[index][2][0]+0.5), int(self.terminals[index][2][1]+0.5), z) for z in xrange(self.pcb.depth)]
            self.pcb.mark_distances(self.pcb.routing_flood_vectors, self.radius, self.via, self.gap, visited, ends)
            # Pair each end with its get_node() value and take the smallest.
            ends = [(self.pcb.get_node(node), node) for node in ends]
            ends.sort()
            _, end = ends[0]
            path = [end]
            # Extend the path until its tip reaches an already visited node.
            while path[-1] not in visited:
                nearer_nodes = self.pcb.all_not_shorting(self.pcb.all_nearer_sorted, \
                            (self.pcb.routing_path_vectors, path[-1], end, self.pcb.dfunc), path[-1], self.radius, self.via, self.gap)
                # Raises StopIteration when no candidate exists; handled below.
                next_node = next(nearer_nodes)
                if minz:
                    # Scan the remaining candidates for one with the same
                    # node[2] as the path tip; the first candidate stays the
                    # default if none matches.
                    for node in nearer_nodes:
                        if node[2] == path[-1][2]:
                            next_node = node
                            break
                path.append(next_node)
            visited |= set(path)
            self.paths.append(path)
            self.pcb.unmark_distances()
        self.paths = self.optimise_paths(self.paths)
        self.add_paths_collision_lines()
        self.add_terminal_collision_lines()
        return True
    except StopIteration:
        # Routing failed part-way: clear the marks and drop this net's state.
        self.pcb.unmark_distances()
        self.remove()
        return False
def _get_router_ids_for_agent(self, context, agent_db, router_ids):
    """Return the router ids the given agent should handle.

    Starts from the parent class' answer and, for agents running in one of
    the DVR modes, adds routers that are not explicitly scheduled to the
    agent but have DVR serviceable ports on the agent's host.
    """
    scheduled = set(super(L3_DVRsch_db_mixin,
                          self)._get_router_ids_for_agent(
        context, agent_db, router_ids))
    requested = set(router_ids or [])
    if requested and scheduled == requested:
        # Every requested router is explicitly scheduled to the agent, so
        # the extra dvr checks are unnecessary.
        return list(scheduled)

    # dvr routers are not explicitly scheduled to agents on hosts with
    # dvr serviceable ports, so they need special handling.
    is_dvr_agent = self._get_agent_mode(agent_db) in [
        n_const.L3_AGENT_MODE_DVR,
        n_const.L3_AGENT_MODE_DVR_NO_EXTERNAL,
        n_const.L3_AGENT_MODE_DVR_SNAT]
    if not is_dvr_agent:
        return list(scheduled)

    if not requested:
        scheduled |= set(self._get_dvr_router_ids_for_host(
            context, agent_db['host']))
        return list(scheduled)

    for router_id in (requested - scheduled):
        subnet_ids = self.get_subnet_ids_on_router(context, router_id)
        if not subnet_ids:
            continue
        if self._check_dvr_serviceable_ports_on_host(
                context, agent_db['host'], list(subnet_ids)):
            scheduled.add(router_id)
    return list(scheduled)
def neargroups(self, blocknames):
    """Given a list or set of block names, finds groups of 'near' blocks.

    Two blocks end up in the same group if they are neighbours or share a
    neighbour. Returns a list of sets of block names.
    """
    def reach(members):
        # The group itself plus every neighbour name of its members.
        covered = set(members)
        for name in members:
            covered = covered | self.block[name].neighbour_name
        return covered

    groups = [set([name]) for name in list(set(blocknames))]
    merged = True
    while merged:
        merged = False
        for position, group in enumerate(groups):
            group_reach = reach(group)
            for other in groups[position + 1:]:
                if group_reach & reach(other):
                    # Absorb the later group, then rescan from the start
                    # because the list changed under the iteration.
                    group.update(other)
                    groups.remove(other)
                    merged = True
                    break
            if merged:
                break
    return groups
def testStandingsBeforeMatches():
    """
    Test to ensure players are properly represented in standings prior
    to any matches being reported.
    """
    deleteMatches()
    deletePlayers()
    for player in ("Melpomene Murray", "Randy Schwartz"):
        registerPlayer(player)

    standings = playerStandings()
    row_count = len(standings)
    if row_count < 2:
        raise ValueError("Players should appear in playerStandings even before "
                         "they have played any matches.")
    if row_count > 2:
        raise ValueError("Only registered players should appear in standings.")
    if len(standings[0]) != 4:
        raise ValueError("Each playerStandings row should have four columns.")

    first_row, second_row = standings
    (id1, name1, wins1, matches1) = first_row
    (id2, name2, wins2, matches2) = second_row
    if any(count != 0 for count in (matches1, matches2, wins1, wins2)):
        raise ValueError(
            "Newly registered players should have no matches or wins.")
    if set([name1, name2]) != set(["Melpomene Murray", "Randy Schwartz"]):
        raise ValueError("Registered players' names should appear in standings, "
                         "even if they have no matches played.")
    print("6. Newly registered players appear in the standings with no matches.")
def __init__(self, data_dir_name):
    '''
    Read meta-data for test data and set table to load based on a given
    order or on schema files in input data

    @param data_dir_name: path to directory containing test data and
                          configuration files (must hold description.yaml)
    '''
    self.log = logging.getLogger(__name__)
    self.dataDir = data_dir_name

    _topLevelConfigFile = os.path.join(self.dataDir, "description.yaml")
    with io.open(_topLevelConfigFile, 'r') as f:
        # safe_load builds plain Python containers only; yaml.load without
        # a Loader can construct arbitrary objects and is rejected outright
        # by recent PyYAML releases.
        self.update(yaml.safe_load(f))

    self.log.debug("Data configuration : %s", self)

    fromFileTables = self._tableFromSchemaFile()
    # A specific load order on a restricted number of tables can be
    # specified in yaml; otherwise every table found via schema files is
    # loaded.
    if not self['tables'].get('load-order'):
        self['tables']['load-order'] = fromFileTables
        self.notLoadedTables = []
    else:
        # self.orderedTables is defined elsewhere in the class; presumably
        # it reflects the configured 'load-order' - verify.
        self.notLoadedTables = list(set(fromFileTables) -
                                    set(self.orderedTables))

    self.log.debug("Tables to load : %s", self.orderedTables)
def _load(self, src, text_src):
    """Populate this analyzer from a prediction result.

    src: a PredictionResult instance, or a string that is handed to
        ``PredictionResult.load``.
    text_src: path stored in ``self.filepath``; when None the path recorded
        in the result (``result.text_src``) is used instead.

    Raises Exception when ``src`` is neither of the accepted types and
    ValueError when the result reports that it is not analyzable.
    """
    if isinstance(src, PredictionResult):
        # Previously this branch left ``result`` unbound, so passing an
        # already loaded result failed on the next statement.
        result = src
    elif isinstance(src, str):
        result = PredictionResult()
        result.load(src)
    else:
        raise Exception('"result" should be PredictionResult or string.')

    if not result.analyzable():
        raise ValueError('The given result is not analyzable.')

    # TODO: a model-ID consistency warning (self.model._hashcode vs
    # result.model_id) was planned here but needs to move to another place.

    self.filepath = result.text_src if text_src is None else text_src
    self.extra_svm_files = result.extra_svm_files
    predicted_y = result.predicted_y
    self.acc = result.get_accuracy()
    decvals = result.decvals
    true_y = result.true_y

    self.insts, self.true_labels, self.predict_labels = [], set(), set()
    for idx in range(len(true_y)):
        self.insts.append(TextInstance(idx, true_y = true_y[idx],
                                       predicted_y = predicted_y[idx],
                                       decvals = list(decvals[idx])))
        self.true_labels.add(true_y[idx])
        self.predict_labels.add(predicted_y[idx])
def _init_settings(cls, matrix):
    """Validate the general-settings table and store it on the class.

    ``matrix`` is checked for shape and missing values, converted into a
    DataFrame indexed by the "key" column, and then checked for required
    keys and allowed values. On success the key -> code mapping is stored
    in ``cls.settings``; each failed check raises its own exception.
    """
    # Check for errors caused by an incorrectly filled-in properties table
    if not cls._check_shape(fact=matrix.shape, req=cls._required_settings_shape):
        raise WrongShapeException(fact=matrix.shape, req=cls._required_settings_shape,
                                  name="Проверка размерности таблицы с общими настройками", aud=cls.outer_name)
    if not cls._check_nans(fact=matrix):
        raise NansInMatrixException(name="Проверка наличия отсутствующих значений в общих настройках", aud=cls.outer_name)
    # To check the table itself a few manipulations are needed, because by
    # default everything arrives as matrices: the first row is used as the
    # header and then replaced by the standard column names.
    settings = pd.DataFrame(matrix[1:], columns=matrix[0])
    settings.columns = cls._standard_settings_column_names
    settings.set_index("key", inplace=True)
    # Check that all settings have been entered into the table
    # NOTE(review): the backslash inside the message below looks like a
    # collapsed line continuation - confirm the intended text.
    if not cls._check_settings(fact=set(settings.index), req=cls._required_settings_options):
        raise NotEnoughSettings(fact=set(settings.index), req=cls._required_settings_options,
                                name="Проверка вхождения всех необходимых\ переменных по ключу в общих настройках", aud=cls.outer_name)
    # Check that this is exactly what we expected to receive as input
    if not cls._check_values_condition(fact=settings["code"].to_dict(), req=cls._required_settings_values_condition):
        raise ValuesConditionException(fact=settings["code"].to_dict(), req=cls._required_settings_values_condition,
                                       name="Проверка валидности ввода настроек в таблицу с общими настройками", aud=cls.outer_name)
    cls.settings = settings["code"].to_dict()
def __init__(self, deviceRef):
    """Wrap an IOHIDDevice reference and cache its descriptive properties.

    Registers the instance in ``_device_lookup``, reads the device
    properties into attributes, collects the device elements and installs
    the removal and input-value callbacks.
    """
    # Check that we've got a valid IOHIDDevice.
    assert deviceRef
    assert cf.CFGetTypeID(deviceRef) == iokit.IOHIDDeviceGetTypeID()
    _device_lookup[deviceRef.value] = self
    self.deviceRef = deviceRef

    # (attribute name, device property key), read in this order.
    property_table = (
        ("transport", "Transport"),
        ("vendorID", "VendorID"),
        ("vendorIDSource", "VendorIDSource"),
        ("productID", "ProductID"),
        ("versionNumber", "VersionNumber"),
        ("manufacturer", "Manufacturer"),
        ("product", "Product"),
        ("serialNumber", "SerialNumber"),  # always returns None; apple bug?
        ("locationID", "LocationID"),
        ("primaryUsage", "PrimaryUsage"),
        ("primaryUsagePage", "PrimaryUsagePage"),
    )
    for attribute, key in property_table:
        setattr(self, attribute, self.get_property(key))

    # Populate self.elements with our device elements.
    self.get_elements()

    # Observer sets must exist before the callbacks are registered.
    self.value_observers = set()
    self.removal_observers = set()
    self.register_removal_callback()
    self.register_input_value_callback()
def get_adjacency_lists(in_file):
    """Load an undirected graph from an adjacency-list text file.

    Each non-blank line holds whitespace-separated integers: a vertex
    followed by its neighbours. An edge is recorded only when the neighbour
    id is greater than the vertex id, so an edge listed from both endpoints
    is stored once.

    Returns ``(edges, verts)`` where ``edges`` maps an edge id to
    ``[v1, v2]`` and ``verts`` maps a vertex to the set of edge ids touching
    it. Blank lines are skipped instead of raising IndexError.
    """
    edges = {}
    verts = {}
    edge_count = 0
    with open(in_file) as f:
        for line in f:
            vertex = line.split()
            if not vertex:
                continue
            v1 = int(vertex[0])
            for v2_s in vertex[1:]:
                v2 = int(v2_s)
                if v2 > v1:  # avoid adding duplicated edges in the loaded graph
                    verts.setdefault(v1, set()).add(edge_count)  # edges in v1
                    verts.setdefault(v2, set()).add(edge_count)  # edges in v2
                    edges[edge_count] = [v1, v2]
                    edge_count += 1
    return edges, verts
def get_analysis_analysisID_dataStage01ResequencingAnalysis(self, analysis_id_I):
    '''Query the rows of an analysis that are flagged as used.

    Returns a dict mapping each of analysis_id, experiment_id, lineage_name,
    sample_name and analysis_type to the list of distinct values found, or
    an empty dict when no row matches. A SQLAlchemyError is printed and
    None is returned.'''
    try:
        rows = self.session.query(data_stage01_resequencing_analysis).filter(
            data_stage01_resequencing_analysis.analysis_id.like(analysis_id_I),
            data_stage01_resequencing_analysis.used_.is_(True)).all()
        analysis_O = {}
        if rows:
            columns = ('analysis_id', 'experiment_id', 'lineage_name',
                       'sample_name', 'analysis_type')
            for column in columns:
                values = [getattr(row, column) for row in rows]
                analysis_O[column] = list(set(values))
        return analysis_O
    except SQLAlchemyError as e:
        print(e)
def get_all_group_lines(import_groups):
    """Flatten grouped import lines into one ordered list.

    Groups listed in ``explicit_groups`` come first in that fixed order,
    followed by the remaining groups in sorted order. Each group contributes
    its sorted lines followed by one empty string. Returns [] when
    ``import_groups`` is empty or None.
    """
    if not import_groups:
        return []

    def inner_class_order(left, right):
        # Imports of inner classes must directly follow their outer class:
        #   import com.foo.Bar;
        #   import com.foo.Bar.Baz;
        # This is not lexicographical, so a plain sort will not do.
        left_match = IMPORT_CLASS_RE.match(left)
        right_match = IMPORT_CLASS_RE.match(right)
        if left_match.group('outer') != right_match.group('outer'):
            return cmp(left, right)
        return cmp(left_match.group('inners'), right_match.group('inners'))

    def get_group_lines(group):
        ordered = sorted(import_groups[group], inner_class_order)
        return ordered + ['']

    explicit_groups = ['java', 'javax', 'scala', 'com', 'net', 'org']
    group_order = [name for name in explicit_groups if name in import_groups]
    # Gather remaining groups.
    group_order += sorted(set(import_groups.keys()) - set(explicit_groups))

    all_lines = []
    for group in group_order:
        all_lines += get_group_lines(group)
    return all_lines
def __init__(self, config):
    """Set up logging, build info and the bookkeeping containers.

    config: object providing ``topobjdir``; ``<topobjdir>/subconfigures``
        is read, when present, as a list of external paths (one per line).
    """
    self.populate_logger()

    self.config = config

    mozinfo.find_and_update_from_json(config.topobjdir)

    # Python 2.6 doesn't allow unicode keys to be used for keyword
    # arguments. This gross hack works around the problem until we
    # rid ourselves of 2.6.
    self.info = {}
    for k, v in mozinfo.info.items():
        if isinstance(k, unicode):
            k = k.encode('ascii')
        self.info[k] = v

    self._libs = OrderedDefaultDict(list)
    self._binaries = OrderedDict()
    self._linkage = []
    self._static_linking_shared = set()

    # Keep track of external paths (third party build systems), starting
    # from what we run a subconfigure in. We'll eliminate some directories
    # as we traverse them with moz.build (e.g. js/src).
    subconfigures = os.path.join(self.config.topobjdir, 'subconfigures')
    paths = []
    if os.path.exists(subconfigures):
        # Close the handle deterministically instead of relying on GC.
        with open(subconfigures) as fh:
            paths = fh.read().splitlines()
    self._external_paths = set(mozpath.normsep(d) for d in paths)
    # Add security/nss manually, since it doesn't have a subconfigure.
    self._external_paths.add('security/nss')
def decorated(self, **kwargs):
    """A wrapped test method that treats some arguments in a special way.

    ``mode``, ``required_tpu`` and ``required_gpus`` are removed from
    ``kwargs``; ``distribution`` is read but left in place. The test is
    skipped when its TPU/GPU requirements do not match the TPU_TEST /
    GPU_TEST flags or the number of available GPUs. Otherwise the wrapped
    ``test_method`` (a free variable of the enclosing scope) is called in
    eager or graph mode with exactly the arguments it declares.

    Raises ValueError when ``kwargs`` holds names the test method does not
    accept, or when ``mode`` is neither 'eager' nor 'graph'.
    """
    mode = kwargs.pop("mode", "graph")
    distribution = kwargs.get("distribution", None)
    required_tpu = kwargs.pop("required_tpu", False)
    required_gpus = kwargs.pop("required_gpus", None)

    if distribution:
        # A distribution carries its own hardware requirements, so explicit
        # ones must not be given alongside it.
        assert required_gpus is None, (
            "Do not use `required_gpus` and `distribution` together.")
        assert required_tpu is False, (
            "Do not use `required_tpu` and `distribution` together.")
        required_gpus = distribution.required_gpus
        required_tpu = distribution.required_tpu

    if required_tpu and not TPU_TEST:
        self.skipTest("Test requires a TPU, but it's not available.")
    if not required_tpu and TPU_TEST:
        self.skipTest("Test that doesn't require a TPU.")

    if not required_gpus:
        if GPU_TEST:
            self.skipTest("Test that doesn't require GPUs.")
    elif context.num_gpus() < required_gpus:
        self.skipTest(
            "{} GPUs are not available for this test. {} GPUs are available".
            format(required_gpus, context.num_gpus()))

    # At this point, `kwargs` doesn't have `required_gpus` or `required_tpu`
    # that the user might have specified.
    # NOTE(review): `mode` was also popped above, so `kwargs` no longer has
    # it; a test method that declares a `mode` parameter would hit KeyError
    # in the loop below - confirm whether that is intended.
    requested_arguments = tf_inspect.getfullargspec(test_method).args
    missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
        set(requested_arguments + ["mode"]))
    if missing_arguments:
        raise ValueError("The test is missing arguments {} .".format(
            missing_arguments))

    # Forward only what the test method declares; `self` is supplied here.
    kwargs_to_pass = {}
    for arg in requested_arguments:
        if arg == "self":
            kwargs_to_pass[arg] = self
        else:
            kwargs_to_pass[arg] = kwargs[arg]

    if mode == "eager":
        with ops.Graph().as_default(), context.eager_mode():
            if distribution:
                # The test receives the strategy, not the wrapper object.
                kwargs_to_pass["distribution"] = distribution.strategy
            test_method(**kwargs_to_pass)
    elif mode == "graph":
        with ops.Graph().as_default(), context.graph_mode():
            if distribution:
                kwargs_to_pass["distribution"] = distribution.strategy
            test_method(**kwargs_to_pass)
    else:
        raise ValueError(
            "'mode' has to be either 'eager' or 'graph' and not {}".format(
                mode))
def times(*combined):
    """Generate a product of N sets of combinations.

    times(combine(a=[1,2]), combine(b=[3,4])) == combine(a=[1,2], b=[3,4])

    Args:
      *combined: N lists of dictionaries that specify combinations.

    Returns:
      a list of dictionaries for each combination.

    Raises:
      ValueError: if some of the inputs have overlapping keys.
    """
    assert combined

    if len(combined) == 1:
        return combined[0]

    head = combined[0]
    # The product of the tail is built first, then crossed with the head.
    tail_product = times(*combined[1:])

    product = []
    for left in head:
        for right in tail_product:
            shared_keys = set(left.keys()) & set(right.keys())
            if shared_keys:
                raise ValueError("Keys need to not overlap: {} vs {}".format(
                    left.keys(), right.keys()))
            merged = OrderedDict(left.items())
            merged.update(right.items())
            product.append(merged)
    return product
def _get_episode_search_strings(self, ep_obj, add_string=''):
    """Build the provider search strings for a single episode.

    Returns [] when ``ep_obj`` is falsy, otherwise a one-element list holding
    ``{'Episode': [...]}`` with one string per distinct show name. The
    episode part depends on the show type: air date, air date plus
    abbreviated month (sports), absolute number (anime) or the standard
    season/episode pattern followed by ``add_string``.
    """
    if not ep_obj:
        return []

    episode_strings = []

    if self.show.air_by_date:
        for name in set(show_name_helpers.allPossibleShowNames(self.show)):
            candidate = sanitizeSceneName(name) + '.'
            candidate += str(ep_obj.airdate).replace('-', '|')
            episode_strings.append(candidate)
    elif self.show.sports:
        for name in set(show_name_helpers.allPossibleShowNames(self.show)):
            candidate = sanitizeSceneName(name) + '.'
            candidate += str(ep_obj.airdate).replace('-', '|')
            candidate += '|'
            candidate += ep_obj.airdate.strftime('%b')
            episode_strings.append(candidate)
    elif self.show.anime:
        for name in set(show_name_helpers.allPossibleShowNames(self.show)):
            candidate = sanitizeSceneName(name) + '.'
            candidate += "%i" % int(ep_obj.scene_absolute_number)
            episode_strings.append(candidate)
    else:
        for name in set(show_name_helpers.allPossibleShowNames(self.show)):
            candidate = show_name_helpers.sanitizeSceneName(name) + '.'
            candidate += sickbeard.config.naming_ep_type[2] % {
                'seasonnumber': ep_obj.scene_season,
                'episodenumber': ep_obj.scene_episode}
            candidate += ' %s' % add_string
            # Any run of whitespace becomes a single dot.
            episode_strings.append(re.sub('\s+', '.', candidate))

    return [{'Episode': episode_strings}]
def plot_overtime(data_file):
    """Plot how similar each time step's chains are to the previous step's.

    For every time step after the first, and for every pair index, the
    similarity is the share of the current chain's first elements that also
    occur in the previous step's chain. Mean and standard deviation per
    time step are handed to ``plotting.overtime_plot``.
    """
    data = performance.load_score_dict(data_file)
    avg_sim, std_sim = [], []

    # Compute the average fraction of matching paths for each case.
    for index, time_step in enumerate(data):
        if index == 0:
            continue  # the first step has nothing to be compared against
        prev_step = data[index - 1]
        sim_list = []
        for pair_index, pair in enumerate(time_step):
            curr_chain = set(item[0] for item in pair)
            print(curr_chain)
            prev_chain = set(item[0] for item in prev_step[pair_index])
            if not curr_chain or not prev_chain:
                continue
            overlap = curr_chain & prev_chain
            sim_list.append(float(len(overlap)) / len(curr_chain))
        avg_sim.append(np.mean(sim_list))
        std_sim.append(np.std(sim_list))
        print("Next Time Step!")

    plotting.overtime_plot(avg_sim, std_sim)
def ind_complement(v, ind):
    """Return the sorted positions of ``v`` that ``ind`` does not select.

    ``ind`` may be a single index (an instance of ``_INDEXTYPES``), a slice,
    or an iterable of indices; negative indices count from the end.
    """
    if isinstance(ind, _INDEXTYPES):
        ind = [ind]
    elif type(ind) is slice:
        ind = range(*ind.indices(len(v)))
    size = len(v)
    selected = set()
    for i in ind:
        # Normalise negative indices to their positive position.
        selected.add(i if i >= 0 else size + i)
    return sorted(set(range(size)) - selected)
def getHoster(self):
    """Return the hoster names the selected account can handle.

    Returns [] when no usable account exists or the API answers with a
    status other than 200. The API hoster list is then filtered by the
    'hosterList' config entry: kept when 'hosterListMode' is "listed",
    removed when it is "unlisted", left untouched for any other mode.
    """
    # If no accounts are available there will be no hosters available
    if not self.account or not self.account.canUse():
        return []

    # Get account data
    (user, data) = self.account.selectAccount()

    # Get supported hosters list from premiumize.me using the json API v1
    # (see https://secure.premiumize.me/?show=api).
    # The query separators must be literal "&params[...]"; they had been
    # mangled into a pilcrow character, which corrupted the request.
    answer = getURL("https://api.premiumize.me/pm-api/v1.php?method=hosterlist&params[login]=%s&params[pass]=%s" % (user, data['password']))
    data = json_loads(answer)

    # If the account is not valid there are no hosters available
    if data['status'] != 200:
        return []

    # Extract hosters from json file
    hosters = set(data['result']['hosterlist'])

    # Read config to check if certain hosters should not be handled
    configMode = self.getConfig('hosterListMode')
    if configMode in ("listed", "unlisted"):
        # The list may be separated by ',', '|' or ';'.
        configList = set(self.getConfig('hosterList').strip().lower().replace('|', ',').replace(';', ',').split(','))
        configList.discard(u'')
        if configMode == "listed":
            hosters &= configList
        else:
            hosters -= configList

    return list(hosters)
def _dict_diff(a, b):
    """A one way dictionary diff.

    a: a dictionary (the master side)
    b: a dictionary (the slave side)

    Returns: True if ``a`` has a key that ``b`` lacks, or if any value of
    ``a`` differs from the value in ``b`` when both are compared as strings.
    Keys that exist only in ``b`` are ignored.
    """
    # Only things the master has which the slave lacks matter
    extra_keys = set(a.keys()) - set(b.keys())
    if extra_keys:
        LOG.debug('metadata diff -- master has extra keys: %(keys)s',
                  {'keys': ' '.join(extra_keys)})
        return True

    for key in a:
        if str(a[key]) == str(b[key]):
            continue
        LOG.debug('metadata diff -- value differs for key '
                  '%(key)s: master "%(master_value)s" vs '
                  'slave "%(slave_value)s"',
                  {'key': key,
                   'master_value': a[key],
                   'slave_value': b[key]})
        return True

    return False
def calculateSparseDictCOO(data_set, data_label_hash, jump=1, valid_flag=False):
    """Split (doc_id, column, value) triples into train/validation COO parts.

    data_set: sequence of triples; element 0 is the document id, element 1
        a 1-based column index and element 2 the value (all int-convertible).
    data_label_hash: mapping from document id to its label.
    jump: only document ids divisible by ``jump`` are used.
    valid_flag: when true, every ``validation_perc``-th selected document
        (by position in the ascending id list) goes to the validation part.
        ``validation_perc`` is a module-level name defined outside this
        function.

    Returns ``(train, valid)``; each is a tuple
    ``(translate(rows), cols, values, labels)``.

    Document ids are now ordered ascending (the original sort was discarded
    by a following ``set()``), membership tests use sets instead of lists,
    and all intermediate collections are real lists so the function behaves
    the same under Python 2 and 3.
    """
    row, col, data = [], [], []
    row_valid, col_valid, data_valid = [], [], []

    doc_ids = sorted(set(int(entry[0]) for entry in data_set))
    base_ids_list = [doc_id for doc_id in doc_ids if doc_id % jump == 0]
    train_ids = base_ids_list
    valid_ids = []
    if valid_flag:
        valid_ids = [doc_id for position, doc_id in enumerate(base_ids_list)
                     if position % validation_perc == 0]
        train_ids = sorted(set(base_ids_list) - set(valid_ids))

    labels = [int(data_label_hash[trid]) for trid in train_ids]
    labels_valid = [int(data_label_hash[vlid]) for vlid in valid_ids]

    # Sets give O(1) membership inside the loop over all triples.
    train_lookup = set(train_ids)
    valid_lookup = set(valid_ids)
    for entry in data_set:
        doc_id = int(entry[0])
        if doc_id in train_lookup:
            row.append(doc_id)
            col.append(int(entry[1]) - 1)
            data.append(int(entry[2]))
        elif doc_id in valid_lookup:
            row_valid.append(doc_id)
            col_valid.append(int(entry[1]) - 1)
            data_valid.append(int(entry[2]))

    train = translate(row), col, data, labels
    valid = translate(row_valid), col_valid, data_valid, labels_valid
    return train, valid
def includes_for_type(idl_type):
    """Return the set of header paths needed to use ``idl_type``.

    Array-like types resolve to the includes of their element type. Types
    listed in INCLUDES_FOR_TYPE use that table; basic types, typed arrays,
    named constructors and types missing from ``component_dir`` need no
    include; everything else maps to its generated Dart binding header.
    """
    idl_type = idl_type.preprocessed_type

    # Composite types: recurse into the element type. Recursing on
    # ``idl_type`` itself, as before, could never make progress.
    element_type = idl_type.native_array_element_type
    if element_type:
        return includes_for_type(element_type)

    # Simple types
    base_idl_type = idl_type.base_type
    if base_idl_type in INCLUDES_FOR_TYPE:
        return INCLUDES_FOR_TYPE[base_idl_type]
    if idl_type.is_basic_type:
        return set()
    if idl_type.is_typed_array_type:
        # Typed array factory methods are already provided by DartUtilities.h.
        return set()
    if base_idl_type.endswith('ConstructorConstructor'):
        # FIXME: rename to NamedConstructor
        # FIXME: replace with a [NamedConstructorAttribute] extended attribute
        # Ending with 'ConstructorConstructor' indicates a named constructor,
        # and these do not have header files, as they are part of the generated
        # bindings for the interface
        return set()
    if base_idl_type.endswith('Constructor'):
        # FIXME: replace with a [ConstructorAttribute] extended attribute
        base_idl_type = idl_type.constructor_type_name
    if base_idl_type not in component_dir:
        return set()
    return set(['gen/sky/bindings/Dart%s.h' % base_idl_type])
def load_data(name, plotdir, print_out=True):
    "Read data and split into train, test data."
    df = read_data(name)
    train, test = train_test_split(df, test_size=0.3)
    # NOTE: plotting a scatter matrix of the training data into plotdir is
    # disabled (takes a while, not that useful); plotdir is unused here.

    # Separate the target columns from the features.
    yvars = ['risk', 'Y']
    train_y, test_y = train[yvars], test[yvars]
    train = train.drop(['risk', 'Y'], axis=1)
    test = test.drop(['risk', 'Y'], axis=1)

    if not print_out:
        return train, test, train_y, test_y

    print("train test types %s %s %s %s" %
          (type(train), type(test), type(train_y), type(test_y)))
    print("train test shapes %s %s %s %s" %
          (train.shape, test.shape, train_y.shape, test_y.shape))
    print("train head\n%s" % (train[:3]))
    print("test head\n%s" % (test[:3]))
    print("train_y set %s, test_y set %s" %
          (set(train_y['Y']), set(test_y['Y'])))
    print("train_y stats\n%s\ntest_y stats\n%s" %
          (train_y.describe(), test_y.describe()))
    # NOTE: dropping high std/mean columns ('b_sugar_up') and low importance
    # columns ('age', 'exer_slope') was tried here and is left disabled.
    return train, test, train_y, test_y
def extract_features(self, tweet_message):
    """Turn a tagged tweet into a feature dict.

    ``tweet_message`` is iterated as (word, tag) pairs; stopwords and
    all-digit words are ignored. The result holds one ``has_<word>`` flag
    per bag-of-words entry, one ``has_<TAG>`` flag for each of six
    two-letter tag prefixes, and ``has_negator`` (only when a negation word
    is present). Returns None when the bag of words is empty.
    """
    if len(self.bag_of_words) == 0:
        printf('Bag-of-Words empty!')
        return None

    def usable(word):
        return word not in stopwords and not word.isdigit()

    tweet_words = [word.lower() for word, tag in tweet_message if usable(word)]
    tweet_tags = [tag[:2] for word, tag in tweet_message if usable(word)]
    words_present = set(tweet_words)

    feature_set = {}

    # 1st set of features: bag-of-words
    for word in self.bag_of_words:
        feature_set['has_'+word] = (word in words_present)

    # 2nd set of features: the tags present in the message
    for tag in ['NN','VG','CD','JJ','CC','RB']:
        feature_set['has_'+tag] = (tag in tweet_tags)

    # 3rd feature: negation is present?
    negators = set(['not', 'none', 'nobody', 'never', 'nothing', 'lack', 't','n\'t','dont', 'no'])
    if not negators.isdisjoint(words_present):
        feature_set['has_negator'] = True

    return feature_set
def copy_apps(hemps_path, testcase_path, apps_name_list):
    """Mirror the listed applications into the testcase directory.

    Every application in ``apps_name_list`` is copied from
    ``<hemps_path>/applications/`` to ``<testcase_path>/applications/``
    (excluding ".svn"); afterwards any directory found in the testcase
    applications folder that is not in the list is deleted.
    """
    source_app_path = hemps_path+"/applications/"
    testcase_app_path = testcase_path+"/applications/"

    create_ifn_exists(testcase_app_path)

    # Copy each app described in the testcase file.
    for app_name in apps_name_list:
        generic_copy(source_app_path + app_name,
                     testcase_app_path + app_name,
                     [".svn"])

    # Directories currently present in the testcase applications folder.
    apps_in_testcase = [entry for entry in os.listdir(testcase_app_path)
                        if os.path.isdir(testcase_app_path+entry)]

    # Remove apps that are present but no longer part of the testcase.
    for stale_app in list(set(apps_in_testcase) - set(apps_name_list)):
        delete_if_exists(testcase_app_path + stale_app)
def test_parameter_grid():
    """Test basic properties of ParameterGrid."""
    single_param = {"foo": [1, 2, 3]}
    single_grid = ParameterGrid(single_param)
    assert_true(isinstance(single_grid, Iterable))
    assert_true(isinstance(single_grid, Sized))
    assert_equal(len(single_grid), 3)

    two_params = {"foo": [4, 2],
                  "bar": ["ham", "spam", "eggs"]}
    two_grid = ParameterGrid(two_params)
    assert_equal(len(two_grid), 6)

    expected_points = set(("bar", x, "foo", y)
                          for x, y in product(two_params["bar"],
                                              two_params["foo"]))
    # loop to assert we can iterate over the grid multiple times
    for _ in xrange(2):
        # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
        seen_points = set(tuple(chain(*(sorted(point.items()))))
                          for point in two_grid)
        assert_equal(seen_points, expected_points)

    # Special case: empty grid (useful to get default estimator settings)
    empty_grid = ParameterGrid({})
    assert_equal(len(empty_grid), 1)
    assert_equal(list(empty_grid), [{}])

    grid_with_empty = ParameterGrid([{'C': [1, 10]}, {}])
    assert_equal(len(grid_with_empty), 3)
    assert_equal(list(grid_with_empty), [{'C': 1}, {'C': 10}, {}])
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that
    document by:

    a) Removing duplicates

    prop is a comma-separated list of property paths. For each path whose
    value is a list, entries are compared case-insensitively after removing
    spaces, periods, parentheses, brackets and braces; the first occurrence
    of each distinct entry is kept, in its original position.

    Returns the document serialised as JSON. When the body cannot be parsed,
    the response code is set to 500 and an error string is returned. When
    prop is not given, the document is returned unchanged.
    '''
    # Parse exactly once, up front, so `data` is always bound at the return.
    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if prop:
        for p in prop.split(","):
            if not exists(data, p):
                continue
            v = getprop(data, p)
            if not isinstance(v, list):
                continue
            seen = set()
            unique = []
            for value in v:
                # Remove whitespace, periods, parens, brackets
                key = re.sub(r"[ \.\(\)\[\]\{\}]", "", value).lower()
                if key not in seen:
                    seen.add(key)
                    unique.append(value)
            setprop(data, p, unique)

    return json.dumps(data)
def _computeConflicts( self ):
    """Recompute ``self.conflicts`` and ``self.warnings`` for the grammar.

    Records first/follow conflicts, unused-nonterminal warnings, undefined
    nonterminals, first/first conflicts between the expanded rules of one
    nonterminal, and first/follow conflicts for list macros.

    Returns the list of conflicts (also stored in ``self.conflicts``).
    """
    self.conflicts = []
    self.warnings = []
    nonterminalUsageMap = {N: list() for N in self.nonterminals} # maps nonterminal to rules that use this nonterminal in its production
    for R in self.expandedRules:
        for M in R.production.morphemes:
            if isinstance(M, NonTerminal):
                nonterminalUsageMap[M].append(R)
    for N in self.nonterminals:
        # N can derive the empty morpheme and its first set overlaps its
        # follow set.
        if self._empty in self.first[N] and len(self.first[N].intersection(self.follow[N])):
            self.conflicts.append( FirstFollowConflict( N, self.first[N], self.follow[N] ) )
        # No rule uses N in its production and N is not flagged as generated.
        if not len(nonterminalUsageMap[N]) and not N.generated:
            self.warnings.append(UnusedNonterminalWarning(N))
        NR = self.getExpandedRules( N )
        if len(NR) == 0:
            self.conflicts.append( UndefinedNonterminalConflict(N) )
        # Compare every ordered pair of distinct rules for N, so an
        # overlapping pair is reported once in each direction.
        for x in range(len(NR)):
            for y in range(len(NR)):
                if x == y:
                    continue
                xR = self._pfirst(NR[x].production)
                yR = self._pfirst(NR[y].production)
                # The empty morpheme alone does not count as an overlap.
                intersection = xR.intersection(yR.difference({self._empty}))
                if intersection != set():
                    self.conflicts.append( FirstFirstConflict(NR[x], NR[y], self) )
    for macro in self.macros:
        if isinstance(macro, MorphemeListMacro):
            # NOTE(review): the test reads first[macro.morpheme] but the
            # conflict is built from first[macro.nonterminal] - confirm that
            # this difference is intended.
            if self.first[macro.morpheme].intersection(self.follow[macro]) != set():
                self.conflicts.append( ListFirstFollowConflict(macro, self.first[macro.nonterminal], self.follow[macro]) )
    return self.conflicts
def get_sendable_users(self, project):
    """Return the project members that should receive alerts.

    Members whose per-project alert option is 0 are excluded. Members with
    no per-project option are additionally checked against their
    'subscribe_by_default' option and dropped when it is '0'.
    """
    conf_key = self.get_conf_key()

    # user id -> explicit alert setting for this project
    alert_settings = {}
    project_options = UserOption.objects.filter(
        project=project,
        key='%s:alert' % conf_key,
    )
    for option in project_options:
        alert_settings[option.user_id] = int(option.value)

    disabled = set(user for user, value in alert_settings.iteritems()
                   if value == 0)

    member_set = set(project.member_set.exclude(
        user__in=disabled,
    ).values_list('user', flat=True))

    # determine members default settings
    members_to_check = set(user for user in member_set
                           if user not in alert_settings)
    if not members_to_check:
        return member_set

    disabled = set(UserOption.objects.filter(
        key='subscribe_by_default',
        value='0',
        user__in=members_to_check,
    ).values_list('user', flat=True))
    return filter(lambda member: member not in disabled, member_set)
def test00_pg_hba_conf_file(self):
    """Check the temporary pg_hba.conf lists the addresses of the new hosts.

    gpexpand is run with a fault point set so that it fails; the temporary
    pg_hba.conf it leaves behind is then inspected and the expansion is
    rolled back.
    """
    os.environ[self.GP_COMMAND_FAULT_POINT] = 'gpexpand tar segment template'

    expand_cmd = Command(name='run gpexpand',
                         cmdStr='gpexpand -D %s -i %s' % (self.TEST_DB, self.EXPANSION_INPUT_FILE))
    with self.assertRaisesRegexp(ExecutionError, 'Fault Injection'):
        expand_cmd.run(validateAfter=True)

    # Read from the pg_hba.conf file and ensure that
    # the address of the new hosts is present.
    conf_pattern = os.path.join(os.path.dirname(self.MASTER_DATA_DIRECTORY),
                                'gpexpand*', 'pg_hba.conf')
    ls_cmd = Command(name='get the temp pg_hba.conf file',
                     cmdStr="ls %s" % conf_pattern)
    ls_cmd.run(validateAfter=True)
    temp_pg_hba_conf = ls_cmd.get_results().stdout.strip()

    expected_values = set([self.primary_host_address, self.mirror_host_address])
    actual_values = set()
    host_markers = ('# %s' % self.primary_host_name,
                    '# %s' % self.mirror_host_name)
    with open(temp_pg_hba_conf) as f:
        for line in f:
            if line.strip() not in host_markers:
                continue
            # The entry follows its host marker comment; the address is the
            # fourth field with the part from the last '/' removed.
            address = f.next().strip().split()[3]
            actual_values.add(address[:address.rfind('/')])

    self.assertEqual(actual_values, expected_values)

    GpStart(name='start the database in master only mode', masterOnly=True).run(validateAfter=True)
    Command(name='rollback the expansion', cmdStr='gpexpand -r -D %s' % self.TEST_DB).run(validateAfter=True)
    GpStart(name='start the database').run(validateAfter=True)
def gen_mapping(data):
    """Map every distinct item of ``data`` to an index in 0..k-1.

    Indices follow the iteration order of the set built from ``data``, so
    which item receives which index is not specified.
    """
    mapping = {}
    for position, item in enumerate(set(data)):
        mapping[item] = position
    return mapping
from nltk.stem.lancaster import LancasterStemmer

# Flask application object; everything below configures it at import time.
app = Flask(__name__)

# Config MySQL
# NOTE(review): user and password are hard-coded literals (masked here);
# consider loading them from the environment.
app.config['MYSQL_HOST'] = 'localhost'
app.config['MYSQL_USER'] = '******'
app.config['MYSQL_PASSWORD'] = '******'
app.config['MYSQL_DB'] = 'bass'
app.config['MYSQL_CURSORCLASS'] = 'DictCursor'

# Config Paths
app.config['UPLOAD_PATH'] = "static/uploads/"
app.config['PRODUCT_PATH'] = "static/products/"

# File extensions allowed (images plus txt and pdf)
ALLOWED_EXTENSIONS = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])

# Init MySQL
mysql = MySQL(app)

''' ============================================ RECOMMENDATION SYSTEM CODE STARTS ============================================ '''

# ===============================================================
# TRAIN THE ENGINE
# ===============================================================
# Product catalogue for the recommendation engine, read at import time.
ds = pd.read_csv("recommendation//WebProducts.csv")
def load_policy(filename):
    """Load a pickled Gaussian policy and return its forward-pass function.

    The pickle must hold a dict with a 'nonlin_type' entry ('lrelu' or
    'tanh') and a 'GaussianPolicy' entry describing the observation
    normalisation, the hidden feed-forward layers and the output layer.

    NOTE(review): unpickling executes arbitrary code; only load files from a
    trusted source.

    Returns:
        A callable mapping a batch of observations to a batch of actions.

    Raises:
        NotImplementedError: for an unknown 'nonlin_type'.
        AssertionError: when the pickle does not have the expected layout.
    """

    def affine_from(params):
        # Turn {'AffineLayer': {'W': ..., 'b': ...}} into x -> x @ W + b.
        assert list(params.keys()) == ['AffineLayer']
        affine = params['AffineLayer']
        assert sorted(affine.keys()) == ['W', 'b']
        weight = affine['W'].astype(np.float32)
        bias = affine['b'].astype(np.float32)
        return lambda x: np.matmul(x, weight) + bias

    def make_nonlinearity(name):
        if name == 'tanh':
            return lambda x: np.tanh(x)
        if name == 'lrelu':
            leak = 0.01  # openai/imitation nn.py:233
            return lambda x: 0.5 * (1 + leak) * x + 0.5 * (1 - leak) * np.abs(x)
        raise NotImplementedError(name)

    with open(filename, 'rb') as f:
        data = pickle.load(f)

    nonlin_fn = make_nonlinearity(data['nonlin_type'])

    # The only key besides 'nonlin_type' names the policy type.
    policy_type = [key for key in data.keys() if key != 'nonlin_type'][0]
    assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)
    policy_params = data[policy_type]
    assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}

    # Observation normalisation: stdev recovered from mean and mean-of-squares.
    assert list(policy_params['obsnorm'].keys()) == ['Standardizer']
    standardizer = policy_params['obsnorm']['Standardizer']
    obsnorm_mean = standardizer['mean_1_D']
    obsnorm_meansq = standardizer['meansq_1_D']
    obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))

    # Hidden layers, in sorted layer-name order, each followed by the
    # non-linearity; the output layer has no non-linearity.
    assert list(policy_params['hidden'].keys()) == ['FeedforwardNet']
    hidden_params = policy_params['hidden']['FeedforwardNet']
    layers = []
    for layer_name in sorted(hidden_params.keys()):
        layers.append(affine_from(hidden_params[layer_name]))
        layers.append(nonlin_fn)
    layers.append(affine_from(policy_params['out']))

    def layers_forward(inp):
        # Feed the input through every layer in order.
        activation = inp
        for layer in layers:
            activation = layer(activation)
        return activation

    def forward_pass(obs):
        '''
        Build the forward pass for policy net.
        Input: batched observation. (shape: [batch_size, obs_dim])
        Output: batched action. (shape: [batch_size, action_dim])
        '''
        obs = obs.astype(np.float32)
        # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation
        normed_obs = (obs - obsnorm_mean) / (obsnorm_stdev + 1e-6)
        return layers_forward(normed_obs.astype(np.float32))

    return forward_pass
def update(self):
    """Refresh all child widgets and rebuild the connector/meta layouts.

    Parameter and experiment widgets are always updated. The connector
    parameter widgets and the required-metadata widgets are only rebuilt
    (removed from and re-added to their grid layouts) when the set of names
    returned by the fit for this experiment has changed since the last call.
    """

    # Update parameter and experimental widgets
    for p in self._param_widgets:
        p.update()

    for e in self._expt_widgets:
        e.update()

    # Grab connector fit parameters and required meta data associated with
    # this experiment
    required_meta, connector_param = self._fit.get_experiment_connector(self._experiment)

    # ----------- connector parameters --------------
    if set(connector_param) != self._current_connector_param:

        # Build connector widgets for associated parameters
        connector_keys = list(connector_param.keys())
        connector_keys.sort()

        to_layout = []
        for k in connector_keys:
            try:
                self._connector_widgets[k].update()
            except KeyError:
                # First time this parameter is seen: create its widget.
                self._connector_widgets[k] = FitParamWrapper(self,
                                                             self._fit,
                                                             self._experiment,
                                                             connector_param[k])
            to_layout.append(self._connector_widgets[k])

        # Delete any existing connector fit parameter widgets from layout.
        # Walk backwards so removing an item does not shift pending indexes.
        widget_indexes = list(range(self._num_local_param_widgets,
                                    self._fit_param_layout.count()))
        widget_indexes.reverse()
        for i in widget_indexes:
            self._fit_param_layout.itemAt(i).widget().setParent(None)

        # Add associated connector fit parameter widgets to layout
        for i, w in enumerate(to_layout):

            r = i + self._num_param_rows

            # Lock down the ability to chose a new linkage for this parameter
            w.set_as_connector_param(True)

            self._fit_param_layout.addWidget(QW.QLabel(w.name), r, 0)
            self._fit_param_layout.addWidget(w.guess_widget, r, 1)
            self._fit_param_layout.addWidget(w.alias_widget, r, 2)
            self._fit_param_layout.addWidget(w.fixed_widget, r, 3)
            self._fit_param_layout.addWidget(w.lower_widget, r, 4)
            self._fit_param_layout.addWidget(w.upper_widget, r, 5)

        self._current_connector_param = set(connector_param)

    # ------------- required experiment metadata -----------------
    if set(required_meta) != self._current_required_meta:

        required_meta_keys = list(required_meta.keys())
        required_meta_keys.sort()

        to_layout = []
        for m in required_meta_keys:
            try:
                self._meta_widgets[m].update()
            except KeyError:
                self._meta_widgets[m] = ExperimentMetaWrapper(self,
                                                              self._fit,
                                                              self._experiment,
                                                              m)
            to_layout.append(self._meta_widgets[m])

        # Delete existing widgets from layout
        # NOTE(review): the offset uses a literal 3 rather than
        # self._num_exp_columns -- confirm the two always agree.
        widget_indexes = list(range(self._num_exp_rows*3,
                                    self._experiment_settable_layout.count()))
        widget_indexes.reverse()
        for i in widget_indexes:
            self._experiment_settable_layout.itemAt(i).widget().setParent(None)

        # Add dummy widgets to fill out grid
        hider = QW.QSizePolicy()
        hider.setRetainSizeWhenHidden(True)
        dummies = []
        counter = 0
        while len(to_layout) % self._num_exp_columns != 0:

            # Add fake widget
            dummies.append(ExperimentSettableWrapper(self, self._fit,
                                                     self._experiment,
                                                     "dummy{}".format(counter),
                                                     "", str, None))
            dummies[-1].setSizePolicy(hider)
            to_layout.append(dummies[-1])
            to_layout[-1].hide()
            counter += 1

        # Lay out the connector widgets in rows of num_exp_columns.
        # After padding, len(to_layout) is an exact multiple of the column
        # count, so floor division gives the row count. The previous
        # round((len + 1) / columns) over-counted by one row for 1 column
        # and, because of round-half-to-even, for 2 columns with an odd
        # number of rows, which indexed past the end of to_layout.
        counter = 0
        num_rows = len(to_layout) // self._num_exp_columns
        for i in range(num_rows):
            r = i + self._num_exp_rows
            for j in range(self._num_exp_columns):
                self._experiment_settable_layout.addWidget(to_layout[counter], r, j)
                counter += 1

        self._current_required_meta = set(required_meta)

    # For some reason this must be run twice to get correct size in all cases
    self.adjustSize()
    self.adjustSize()
def get_product(queries):
    """Return every distinct set of queries drawn from the n-fold product.

    Each tuple of the Cartesian product of *queries* with itself
    ``len(queries)`` times is reduced to a frozenset, so order and
    repetition inside a tuple are ignored.
    """
    combos = product(tuple(queries), repeat=len(queries))
    return {frozenset(combo) for combo in combos}
def get_cluster_queries(clusters):
    """Yield, for each cluster, the query that fetches its articles.

    *clusters* is a collection of clusters (for example those returned by
    get_clusters()). The union of all their queries is computed up front and
    handed, together with each cluster's own queries, to
    _get_cluster_query(). The result is a lazy generator.
    """
    all_queries = set()
    for cluster in clusters:
        all_queries.update(cluster)
    return (_get_cluster_query(all_queries, cluster) for cluster in clusters)
# Read "n k" and then the numbers; print the k-th largest distinct sum of
# three values taken at different positions.
n, k = map(int, input().split())
nums = sorted(map(int, input().split()))

triple_sums = {
    nums[i] + nums[j] + nums[k2]
    for i in range(n)
    for j in range(i + 1, n)
    for k2 in range(j + 1, n)
}

res = sorted(triple_sums, reverse=True)
print(res[k - 1])
def users_with_common_genre_interest(identify):
    """Return the ids of other users whose genre matches *identify*'s interest.

    A user from the module-level ``users`` collection is included when its
    "id" differs from ``identify["id"]`` and its "genre" equals
    ``identify["genre interest"]``.
    """
    matches = set()
    for user in users:
        if user["id"] == identify["id"]:
            continue
        if identify["genre interest"] == user["genre"]:
            matches.add(user["id"])
    return matches
for f in sorted(files_child ): # sorted to ensure merge stability if f not in intersect: who = gglob_who_orig.copy() globs[os.path.relpath(f, start=globs_dir)] = who for who in oglob_who: if who not in gglob_who: gglob_who.append(who) add_parent_to_globs(owners.parent, globs, globs_dir) return assert (False) todo = owners_data.copy() done = set() with open(args.out, 'w') as out: out.write('# Auto-generated by the tools/mkowners/mkowners.py tool\n') out.write('# Uses OWNERS files in different modules throughout the\n') out.write('# repository as the source of truth for module ownership.\n') written_globs = [] while todo: head, *todo = todo if head.parent and not head.parent in done: todo.append(head) continue globs = expand_directives(head.dir, head.directives) add_parent_to_globs(head.parent, globs, head.dir) for glob, owners in globs.items(): skip = False for glob1, owners1, dir1 in reversed(written_globs):
def users_with_common_interests_and_genre_interest(user):
    """Return ids found by both the genre-interest and the interests lookup.

    The result is the set of ids returned by
    users_with_common_genre_interest(user) that are also contained in
    users_with_common_interests(user).
    """
    genre_matches = users_with_common_genre_interest(user)
    if not genre_matches:
        # Nothing to filter; as before, the second lookup is not needed.
        return set()

    # Evaluate the second lookup once instead of once per candidate id.
    interest_matches = users_with_common_interests(user)
    return set([
        candidate_id
        for candidate_id in genre_matches
        if candidate_id in interest_matches
    ])
# Find the numbers that are triangular, pentagonal and hexagonal at once,
# for indices 143 .. 99999 of each sequence.
# Integer division keeps every value an exact int (true division would
# produce floats on Python 3).

# Generate triangle numbers
triNum = [i * (i + 1) // 2 for i in range(143, 100000)]

# Generate pentagonal numbers
pentNum = [i * (3 * i - 1) // 2 for i in range(143, 100000)]

# Generate hexagonal numbers
hexNum = [i * (2 * i - 1) for i in range(143, 100000)]

temp = list(set(triNum).intersection(pentNum))
# Sorted so the printed result does not depend on set iteration order.
fin = sorted(set(temp).intersection(hexNum))
print(fin)
def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
               verbose=True, datetime_index=False):
    """
    Returns a dataframe from a QuerySet

    Optionally specify the field names/columns to utilize and
    a field as the index

    Parameters
    ----------

    qs: The Django QuerySet.
    fieldnames: The model field names to use in creating the frame.
         You can span a relationship in the usual Django way
         by using double underscores to specify a related field
         in another model

    index_col: specify the field to use for the index. If the index
               field is not in the field list it will be appended

    coerce_float : boolean, default False
        Attempt to convert values of non-string, non-numeric data (like
        decimal.Decimal) to floating point, useful for SQL result sets

    verbose: boolean If this is ``True`` then populate the DataFrame with the
                human readable versions of any foreign key fields else use
                the primary keys values.
                The human readable version of the foreign key field is
                defined in the ``__unicode__`` or ``__str__``
                methods of the related class definition

    datetime_index: specify whether index should be converted to a
                    DateTimeIndex.
    """

    if fieldnames:
        # Explicit field list: de-duplicate it and resolve the model fields.
        fieldnames = pd.unique(fieldnames)
        if index_col is not None and index_col not in fieldnames:
            # Add it to the field names if not already there
            fieldnames = tuple(fieldnames) + (index_col, )
        fields = to_fields(qs, fieldnames)
    elif is_values_queryset(qs):
        # values()/values_list() queryset: take the column names from the
        # query itself; where they live depends on the Django version.
        if django.VERSION < (1, 9):  # pragma: no cover
            annotation_field_names = list(qs.query.annotation_select)

            # NOTE: list(...) never returns None, so this guard cannot fire.
            if annotation_field_names is None:
                annotation_field_names = []

            extra_field_names = qs.extra_names
            if extra_field_names is None:
                extra_field_names = []

            select_field_names = qs.field_names

        else:  # pragma: no cover
            annotation_field_names = list(qs.query.annotation_select)
            extra_field_names = list(qs.query.extra_select)
            select_field_names = list(qs.query.values_select)

        fieldnames = select_field_names + annotation_field_names + \
            extra_field_names
        # Selected names containing '__', annotations and extras get None
        # instead of a model field.
        fields = [None if '__' in f else qs.model._meta.get_field(f)
                  for f in select_field_names] + \
            [None] * (len(annotation_field_names) + len(extra_field_names))

        # Drop repeated names, keeping the first occurrence. set.add()
        # returns None, so `not uniq_fields.add(...)` is always true and only
        # serves to record the name as seen.
        uniq_fields = set()
        fieldnames, fields = zip(
            *(f for f in zip(fieldnames, fields)
              if f[0] not in uniq_fields and not uniq_fields.add(f[0])))
    else:
        # Plain model queryset: all model fields plus any annotations.
        fields = qs.model._meta.fields
        fieldnames = [f.name for f in fields]
        fieldnames += list(qs.query.annotation_select.keys())

    if is_values_queryset(qs):
        recs = list(qs)
    else:
        recs = list(qs.values_list(*fieldnames))

    df = pd.DataFrame.from_records(recs, columns=fieldnames,
                                   coerce_float=coerce_float)

    if verbose:
        update_with_verbose(df, fieldnames, fields)

    if index_col is not None:
        df.set_index(index_col, inplace=True)

    if datetime_index:
        df.index = pd.to_datetime(df.index, errors="ignore")
    return df
def __init__(self, grams=None, limit=3):
    """Store the gram alphabet and the limit.

    Args:
        grams: collection of allowed grams. When omitted, a new set of the
            lowercase hexadecimal digits is created for this instance, so
            instances no longer share (and cannot corrupt) one default set.
        limit: stored unchanged on the instance; defaults to 3.
    """
    if grams is None:
        grams = set(string.hexdigits.lower())
    self.grams = grams
    self.limit = limit
import sys import os import argparse from collections import namedtuple from datetime import date, datetime, timedelta import sqlite3 from foodlog.my_info import config_path INVALID_TEMPLATE = """ {} {} """ config = config_path() # pylint: disable=invalid-name DB_FILE = config.dir('DB_FILE') MENU_URL = config.dir('MENU_URL') VIEW_MENU_URL = config.dir('VIEW_MENU_URL') VALID = set('start end range title reverse edit'.split()) VALID_RANGES = set('today yesterday lastweek thisweek'.split()) def print_error(header, text): print(INVALID_TEMPLATE.format(header, text)) sys.exit(2) def week_range(num_weeks, firstweekday=3): """ Return the range num_weeks ago Figure out the week where num_weeks == 0 is this week (contains today) and week == 1 is last week, and so on. Weeks are defined by start_day using the datetime.weekday(), so if start_day == 0, the week starts on
def createTree(dataSet, minSup=1):
    """Build an FP-tree from weighted transactions.

    Args:
        dataSet: dict mapping each transaction (an iterable of items) to the
            number of times it occurs.
        minSup: minimum support an item needs to be kept.

    Returns:
        (retTree, headerTable): the tree root and a dict mapping every
        frequent item to ``[support, None]`` as initialised here (the second
        slot is handed to updateTree). ``(None, None)`` when no item reaches
        minSup.
    """
    # Total support of each item: a transaction seen `occurrences` times
    # contributes that amount for every item position it contains.
    support = {}
    for trans, occurrences in dataSet.items():
        for item in trans:
            support[item] = support.get(item, 0) + occurrences

    # Keep only the items that reach the minimum support.
    frequent = {item: total for item, total in support.items() if total >= minSup}
    freqItemSet = set(frequent)
    if not freqItemSet:
        return None, None

    # Header table layout: item -> [support, None]
    headerTable = {item: [total, None] for item, total in frequent.items()}

    # Root of the tree
    retTree = treeNode('Null Set', 1, None)

    for tranSet, count in dataSet.items():
        # Frequent items of this transaction with their global support.
        localD = {}
        for item in tranSet:
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if not localD:
            continue
        # Most frequent items first, so transactions share tree prefixes.
        orderedItems = sorted(localD, key=localD.get, reverse=True)
        updateTree(orderedItems, retTree, headerTable, count)

    return retTree, headerTable
def extract_custom_form_info(req_id, form_id, form_soup):
    """Extract all of the fields passed into the form.

    Arguments:
        req_id (String): The unique string of ints that map to a request (URI).
        form_id (String): The unique string of ints that map to a form.
        form_soup (BeautifulSoup object): The soup of the form you want to
            parse.

    Returns:
        form_info (CustomForm): The CustomForm object with all of the form's
            fields initialized. A form without samples is returned right
            after its fields have been read.

    Raises:
        TypeError: A field handler raised TypeError; it is re-raised with a
            message saying the grid was filled out incorrectly.
        ValueError: The form has duplicate sample names (not checked for
            "96 well plate" containers).
    """
    # If we need any of these types, we can make new methods.
    skip_types = ["charges", "file", "table", "help", "file_no_upload"]

    # Handler per field type; unknown types fall back to "all_others".
    field_strategy = {
        "handsontable_grid": extract_custom_forms.grid_type,
        "checkbox": extract_custom_forms.checkbox_type,
        "all_others": extract_custom_forms.all_other_types
    }

    # Find the desired custom form out of all of the form_soup.
    target_form = form_soup.find(string=form_id)
    target_form = target_form.find_parent("custom-form")
    form_soup = target_form

    form_name = form_soup.find("name").string
    fields_soup = form_soup.find("fields")
    form_info = api_types.CustomForm(form_name, req_id, form_id)

    # Get all of the field information.
    for field_soup in fields_soup.find_all("field"):
        field_type = field_soup.find("type").string
        if field_type in skip_types:
            # Do nothing with the field types that we don't yet care about.
            continue
        try:
            field_strategy[field_type](field_soup, form_info)
        except KeyError:
            # NOTE(review): this also catches a KeyError raised inside the
            # handler itself, not only a missing field type.
            field_strategy["all_others"](field_soup, form_info)
        except TypeError:
            raise TypeError(
                f"The grid in the {form_info.name} form in request"
                f" {form_info.req_id} has been filled out incorrectly. The"
                f" error message is: {traceback.format_exc()}")

    # A form without samples needs no sample processing; return it as-is.
    if not form_info.samples:
        return form_info

    # When duplicates are requested, every sample is copied: the originals
    # get an "A" suffix and the copies a "B" suffix.
    if form_info.field_to_values.get("duplicate_samples"):
        if form_info.field_to_values["duplicate_samples"] == "Yes":
            b_samples = copy.deepcopy(form_info.samples)
            for a_sample, b_sample in zip(form_info.samples, b_samples):
                a_sample.name += "A"
                b_sample.name += "B"
            form_info.samples = form_info.samples + b_samples

    extract_custom_forms.bind_container_info(form_info)

    # Allows duplicate names if they have different well locations in a
    # plate.
    if form_info.con_type != "96 well plate":
        sample_names = [sample.name for sample in form_info.samples]
        if len(set(sample_names)) != len(sample_names):
            raise ValueError(
                f"There are two or more samples named the same thing in"
                f" request {form_info.req_id}. Please review and edit your"
                f" sample names.")

    for name, value in form_info.field_to_values.items():
        if name in ONLY_INT_FIELDS:
            # Keep only digits and dots.
            value = re.sub(r"[^.0-9]", "", value)
        if "_each_sample" in name:
            # "<udf>_each_sample" fields are copied onto every sample under
            # the UDF name (suffix removed, underscores turned into spaces).
            udf_name = name.replace("_each_sample", "").replace("_", " ")
            for sample in form_info.samples:
                sample.udf_to_value[udf_name] = value

    return form_info
def keys(self):
    """Return a list of the distinct keys found across all of self._maps."""
    return list(set(chain_from_iterable(self._maps)))
# coding: utf-8 import sys, json, codecs reload(sys) sys.setdefaultencoding('utf-8') fin = codecs.open("P_list", encoding="utf-8") small_train_data_P_set = set([]) for line in fin: small_train_data_P_set.add(line.strip()) fin.close() # 对于label 为 0的, 去掉 neg:far_apart 这条规则 def main(in_file, to_file): fin = open(in_file) fout = open(to_file, "w") for line in fin: line_list = line.strip().split("\t") dict_label_info = json.loads(line_list[-1]) flag = 0 for P in dict_label_info: # 是我要处理的P if P in small_train_data_P_set: # 遍历candidates for s_o in dict_label_info[P]["candidates"]: label = dict_label_info[P]["candidates"][s_o]["label"] label_rule_list = dict_label_info[P]["candidates"][s_o]["label_info"] # 是NULL类型的, 重新生成label if label == 0:
# -*- coding: utf-8 -*-

# Read the puzzle input. Every line has the form
#   "<ingredient> <ingredient> ... (contains <allergen>, <allergen>)"
puzzle_input = []
all_allergens = []
all_ingredients = []
# The context manager closes the file; the old open(...).readlines() left
# the handle open.
with open("input_test.txt", "r") as puzzle_file:
    for line in puzzle_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        ingredients, allergens = line.split(" (contains ")
        ingredients = ingredients.split(" ")
        all_ingredients.extend(ingredients)
        allergens = allergens.strip()
        # Drop the closing parenthesis before splitting the allergen list.
        allergens = allergens[:-1].split(", ")
        all_allergens.extend(allergens)
        puzzle_input.append([ingredients, allergens])

all_allergens = set(all_allergens)
all_ingredients = set(all_ingredients)

# ingredient -> allergens of every food containing it (repeats kept)
ingredients_dict = {}
for ingredient in all_ingredients:
    ingredients_dict[ingredient] = []
    for food in puzzle_input:
        if ingredient in food[0]:
            ingredients_dict[ingredient].extend(food[1])

# allergen -> ingredient list of every food that declares it
allergens_dict = {}
for allergen in all_allergens:
    allergens_dict[allergen] = []
    for food in puzzle_input:
        if allergen in food[1]:
            allergens_dict[allergen].append(food[0])
def _run_test(self, extr, url, result):
    """Run the extractor for *url* and check the outcome against *result*.

    *extr* is the extractor class expected to handle *url*. *result* is the
    expectation dict; the keys read here are "options", "range", "content",
    "exception", "archive", "url", "keyword", "count" and "pattern". With a
    falsy *result* only the extractor class is checked.
    """
    if result:
        # Apply per-test configuration before the job is created.
        if "options" in result:
            for key, value in result["options"]:
                # "a.b.c" -> config path ("a", "b") and option name "c"
                key = key.split(".")
                config.set(key[:-1], key[-1], value)
        if "range" in result:
            config.set((), "image-range"  , result["range"])
            config.set((), "chapter-range", result["range"])
        content = "content" in result
    else:
        content = False

    tjob = ResultJob(url, content=content)
    self.assertEqual(extr, tjob.extractor.__class__)

    if not result:
        return

    if "exception" in result:
        with self.assertRaises(result["exception"]):
            tjob.run()
        return
    try:
        tjob.run()
    except exception.StopExtraction:
        pass
    except exception.HttpError as exc:
        exc = str(exc)
        # 5xx responses and read timeouts are recorded and the test is
        # skipped (skipTest raises); any other HTTP error is re-raised.
        if re.match(r"'5\d\d ", exc) or \
                re.search(r"\bRead timed out\b", exc):
            self._skipped.append((url, exc))
            self.skipTest(exc)
        raise

    # Archive ids must be unique unless the test opts out with "archive".
    if result.get("archive", True):
        self.assertEqual(
            len(set(tjob.archive_list)),
            len(tjob.archive_list),
            "archive-id uniqueness",
        )

    if tjob.queue:
        # test '_extractor' entries
        for url, kwdict in zip(tjob.url_list, tjob.kwdict_list):
            if "_extractor" in kwdict:
                extr = kwdict["_extractor"].from_url(url)
                self.assertIsInstance(extr, kwdict["_extractor"])
                self.assertEqual(extr.url, url)
    else:
        # test 'extension' entries
        for kwdict in tjob.kwdict_list:
            self.assertIn("extension", kwdict)

    # test extraction results
    if "url" in result:
        self.assertEqual(result["url"], tjob.url_hash.hexdigest())

    if "content" in result:
        expected = result["content"]
        digest = tjob.content_hash.hexdigest()
        if isinstance(expected, str):
            self.assertEqual(digest, expected, "content")
        else:  # assume iterable
            self.assertIn(digest, expected, "content")

    if "keyword" in result:
        expected = result["keyword"]
        if isinstance(expected, dict):
            for kwdict in tjob.kwdict_list:
                self._test_kwdict(kwdict, expected)
        else:  # assume SHA1 hash
            self.assertEqual(expected, tjob.kwdict_hash.hexdigest())

    if "count" in result:
        count = result["count"]
        if isinstance(count, str):
            # A string count is a comparison such as ">= 10"; it is checked
            # against the regex first and only then evaluated.
            self.assertRegex(count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$")
            expr = "{} {}".format(len(tjob.url_list), count)
            self.assertTrue(eval(expr), msg=expr)
        else:  # assume integer
            self.assertEqual(len(tjob.url_list), count)

    if "pattern" in result:
        self.assertGreater(len(tjob.url_list), 0)
        for url in tjob.url_list:
            self.assertRegex(url, result["pattern"])
def __init__(self, tokens):
    """Collect raw trigram counts for an add-k smoothed trigram model.

    Probabilities are not normalised here; only the corpus counts are
    stored so they can be turned into probabilities later for any k.

    Populates:
        self.k: smoothing constant, initialised to 0.0.
        self.counts: self.counts[(w_2, w_1)][w] is the number of times w
            followed the context (w_2, w_1).
        self.context_totals: total count of each context (w_2, w_1).
        self.words / self.V: the distinct tokens and how many there are.

    Args:
        tokens: (list or np.array) of training tokens

    Returns:
        None
    """
    self.k = 0.0

    # counts[(w_2, w_1)][w] -> raw trigram count; keys are tuples.
    self.counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    # context_totals[(w_2, w_1)] -> sum over w of counts[(w_2, w_1)][w]
    self.context_totals = defaultdict(lambda: 0.0)

    # Single pass over the stream, remembering the two preceding tokens.
    vocabulary = set()
    prev_1, prev_2 = None, None
    for word in tokens:
        vocabulary.add(word)
        if not (prev_1 is None or prev_2 is None):
            self.counts[(prev_2, prev_1)][word] += 1
        prev_2, prev_1 = prev_1, word

    # Total number of observations per context.
    for context, followers in self.counts.items():
        self.context_totals[context] = sum(followers.values())

    # Freeze the defaultdicts so later lookups cannot add entries silently.
    self.counts.default_factory = None
    for followers in self.counts.values():
        if isinstance(followers, defaultdict):
            followers.default_factory = None

    # Vocabulary and its size, for normalization.
    self.words = list(vocabulary)
    self.V = len(self.words)
import spacy
import json
import os
from collections import defaultdict

nlp = spacy.load('en')

PATH = os.getcwd()
# Build paths with os.path.join so the script is not tied to the Windows
# path separator.
DATA_DIR = os.path.join(PATH, 'data')

sentences = []

# Reading only needs 'r'; 'r+' additionally demanded write permission.
with open(os.path.join(DATA_DIR, 'proc_sen.json'), 'r') as fp:
    proc_sen = json.load(fp)

uniq_words = set()
count_lemma = defaultdict(int)

# Collect the second entry of every token of every sentence, printing a
# progress counter every 100 sentences.
for c, sen in enumerate(proc_sen, 1):
    if c % 100 == 0:
        print(c)
    for token in sen:
        uniq_words.add(token[1])

# Write the collected words once as JSON and once as plain text, one per line.
with open(os.path.join(DATA_DIR, 'uniq_words.json'), 'w') as fp:
    json.dump(list(uniq_words), fp)

with open(os.path.join(DATA_DIR, 'uniq_words.txt'), 'w') as fp:
    for w in uniq_words:
        fp.write(w + '\n')
# Words that are treated as stop words.
STOP_WORDS = {
    "i", "me", "my", "myself",
    "we", "our", "ours", "ourself", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "herself",
    "it", "its", "itself",
    "they", "them", "their", "themselves",
    "what", "which", "who", "whom",
    "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been",
    "have", "has", "had", "do", "does", "did",
    "a", "an", "the", "and", "but", "if", "or",
    "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very",
    "can", "will", "just", "don", "should", "now",
    "say", "tell", "told", "said",
    "would", "could", "might", "shall",
    "nt", "also",
    "L:", "P:", "O:",
    "s", "t", "m", "re", "ll", "d",
}

# Spelled-out digit -> digit character
WORD2NUM = dict(zip(
    "one two three four five six seven eight nine zero".split(),
    "1234567890",
))
def __init__(self, tokens):
    """Collect the counts needed by a Kneser-Ney smoothed trigram model.

    Variable naming used throughout:
        w   : current word
        w_1 : previous word
        w_2 : previous-previous word

    Populates:
        self.delta: discount, fixed at 0.75.
        self.counts: context tuple -> {word -> count}, with contexts
            () (unigram), (w_1,) (bigram) and (w_2, w_1) (trigram).
        self.type_contexts: word -> set of words seen directly before it.
        self.context_totals: context -> total count of that context.
        self.context_nnz: context -> number of words whose count in that
            context exceeds delta.
        self.type_fertility: word -> size of its type_contexts set.
        self.z_tf: sum of all type fertilities.
        self.words / self.V: the distinct tokens and how many there are.

    Args:
        tokens: (list or np.array) of training tokens

    Returns:
        None
    """
    self.delta = 0.75

    # Raw counts, keyed by the context (N-1)-gram:
    #   unigram: self.counts[()][w]
    #   bigram:  self.counts[(w_1,)][w]
    #   trigram: self.counts[(w_2, w_1)][w]
    self.counts = defaultdict(lambda: defaultdict(lambda: 0))
    self.context_totals = defaultdict(lambda: 0.0)
    self.context_nnz = defaultdict(lambda: 0.0)
    # word -> {preceding word types}, and the size of that set
    self.type_contexts = defaultdict(lambda: set())
    self.type_fertility = defaultdict(lambda: 0.0)
    self.z_tf = 0.0

    # Single pass over the stream, remembering the two preceding tokens.
    prev_1, prev_2 = None, None
    for word in tokens:
        self.counts[()][word] += 1
        if prev_1 is not None:
            self.counts[(prev_1,)][word] += 1
            self.type_contexts[word].add(prev_1)
        if prev_2 is not None:
            self.counts[(prev_2, prev_1)][word] += 1
        prev_2, prev_1 = prev_1, word

    # Per-context totals and number of entries above the discount. A
    # context only gets an nnz entry when at least one count qualifies.
    for context, followers in self.counts.items():
        self.context_totals[context] = sum(followers.values())
        above_delta = sum(1 for cnt in followers.values() if cnt > self.delta)
        if above_delta:
            self.context_nnz[context] += above_delta

    # Type fertilities and their sum.
    for word, preceding in self.type_contexts.items():
        self.type_fertility[word] = len(preceding)
    self.z_tf = float(sum(self.type_fertility.values()))

    # Freeze defaultdicts so we don't accidentally modify later.
    self.counts.default_factory = None
    self.type_contexts.default_factory = None

    # Vocabulary and its size, for normalization.
    self.words = list(self.counts[()].keys())
    self.V = len(self.words)
def __init__(self, func, _type):
    """Register *func* under the hook type "on_cap_<_type>".

    Also starts the instance with an empty ``caps`` set.
    """
    hook_type = "on_cap_{}".format(_type)
    super().__init__(func, hook_type)
    self.caps = set()
def __init__(self, url, dirname, secret):
    """Store the url, directory name and secret; start with nothing known."""
    self.url, self.dirname, self.secret = url, dirname, secret
    self.known = set()
def __init__(self, function):
    """
    Register *function* as an "irc_raw" hook with an empty trigger set.

    :type function: function
    """
    hook_type = "irc_raw"
    _Hook.__init__(self, function, hook_type)
    self.triggers = set()
def __init__(self, func):
    """Register *func* as a "perm_check" hook with an empty perms set."""
    hook_type = "perm_check"
    super().__init__(func, hook_type)
    self.perms = set()
class JsonLexer(Lexer):
    """
    For JSON data structures.

    The text is scanned one character at a time by a small state machine;
    tokens are yielded as ``(start_index, token_type, text)`` tuples.

    .. versionadded:: 1.5
    """

    name = 'JSON'
    aliases = ['json']
    filenames = ['*.json', 'Pipfile.lock']
    mimetypes = ['application/json']

    # No validation of integers, floats, or constants is done.
    # As long as the characters are members of the following
    # sets, the token will be considered valid. For example,
    #
    #     "--1--" is parsed as an integer
    #     "1...eee" is parsed as a float
    #     "trustful" is parsed as a constant
    #
    integers = set('-0123456789')
    floats = set('.eE+')
    constants = set('truefalsenull')  # true|false|null
    hexadecimals = set('0123456789abcdefABCDEF')
    punctuations = set('{}[],')
    whitespaces = {'\u0020', '\u000a', '\u000d', '\u0009'}

    def get_tokens_unprocessed(self, text):
        """Parse JSON data."""

        # Scanner state. `start` is the index where the token currently
        # being scanned begins.
        in_string = False
        in_escape = False
        in_unicode_escape = 0  # hex digits still expected after "\u"
        in_whitespace = False
        in_constant = False
        in_number = False
        in_float = False
        in_punctuation = False

        start = 0

        # The queue is used to store data that may need to be tokenized
        # differently based on what follows. In particular, JSON object
        # keys are tokenized differently than string values, but cannot
        # be distinguished until punctuation is encountered outside the
        # string.
        #
        # A ":" character after the string indicates that the string is
        # an object key; any other character indicates the string is a
        # regular string value.
        #
        # The queue holds tuples that contain the following data:
        #
        #     (start_index, token_type, text)
        #
        # By default the token type of text in double quotes is
        # String.Double. The token type will be replaced if a colon
        # is encountered after the string closes.
        #
        queue = []

        for stop, character in enumerate(text):
            if in_string:
                if in_unicode_escape:
                    if character in self.hexadecimals:
                        in_unicode_escape -= 1
                        if not in_unicode_escape:
                            in_escape = False
                    else:
                        # Not a hex digit: abandon the unicode escape.
                        in_unicode_escape = 0
                        in_escape = False

                elif in_escape:
                    if character == 'u':
                        in_unicode_escape = 4
                    else:
                        in_escape = False

                elif character == '\\':
                    in_escape = True

                elif character == '"':
                    # Closing quote: queue the string, since its token type
                    # is not known until the next significant character.
                    queue.append((start, String.Double, text[start:stop + 1]))
                    in_string = False
                    in_escape = False
                    in_unicode_escape = 0

                continue

            elif in_whitespace:
                if character in self.whitespaces:
                    continue

                # Whitespace after a queued string is queued as well so the
                # tokens are emitted in order.
                if queue:
                    queue.append((start, Text, text[start:stop]))
                else:
                    yield start, Text, text[start:stop]
                in_whitespace = False
                # Fall through so the new character can be evaluated.

            elif in_constant:
                if character in self.constants:
                    continue

                yield start, Keyword.Constant, text[start:stop]
                in_constant = False
                # Fall through so the new character can be evaluated.

            elif in_number:
                if character in self.integers:
                    continue
                elif character in self.floats:
                    in_float = True
                    continue

                if in_float:
                    yield start, Number.Float, text[start:stop]
                else:
                    yield start, Number.Integer, text[start:stop]
                in_number = False
                in_float = False
                # Fall through so the new character can be evaluated.

            elif in_punctuation:
                if character in self.punctuations:
                    continue

                yield start, Punctuation, text[start:stop]
                in_punctuation = False
                # Fall through so the new character can be evaluated.

            # A new token begins at the current character.
            start = stop

            if character == '"':
                in_string = True

            elif character in self.whitespaces:
                in_whitespace = True

            elif character in {'f', 'n', 't'}:  # The first letters of true|false|null
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_constant = True

            elif character in self.integers:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_number = True

            elif character == ':':
                # Yield from the queue. Replace string token types.
                for _start, _token, _text in queue:
                    if _token is Text:
                        yield _start, _token, _text
                    elif _token is String.Double:
                        yield _start, Name.Tag, _text
                    else:
                        yield _start, Error, _text
                queue.clear()

                in_punctuation = True

            elif character in self.punctuations:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_punctuation = True

            else:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                yield start, Error, character

        # Yield any remaining text.
        yield from queue
        if in_string:
            # The string was never closed.
            yield start, Error, text[start:]
        elif in_float:
            yield start, Number.Float, text[start:]
        elif in_number:
            yield start, Number.Integer, text[start:]
        elif in_constant:
            yield start, Keyword.Constant, text[start:]
        elif in_whitespace:
            yield start, Text, text[start:]
        elif in_punctuation:
            yield start, Punctuation, text[start:]
def __init__(self, function):
    """
    Register *function* as an "event" hook with an empty types set.

    :type function: function
    """
    hook_type = "event"
    _Hook.__init__(self, function, hook_type)
    self.types = set()