def create_sfs_dict(inpath):
    # Initialize sfs dict
    sfs_dict = {}
    # Get files from inpath
    files = os.listdir(inpath)
    for filename in tqdm.tqdm(files):
        if filename.endswith(".vcf"):
            # Load in VCF file
            my_vcf = VCF(inpath + '/' + filename)
            # Split VCF filename to extract population size, bottleneck and generation
            split_filename = filename.split('_')
            N_sims = int(split_filename[1].split('N')[1])
            bot = float(split_filename[2].split('bot')[1])
            gen = int(split_filename[3].split('gen')[1].split('.')[0])
            N_samples = len(my_vcf.samples)
            # Initialize list for sfs
            sfs_list = [0] * ((2 * N_samples) + 1)
            # Add singletons, doubletons, ...n-tons
            for variant in my_vcf:
                AC = variant.INFO.get('AC')
                sfs_list[AC] += 1
            # Add invariant sites to sfs list
            ch_length = 1e8
            sfs_list[0] = int(ch_length - sum(sfs_list))
            # Use sfs list to instantiate SFS class (Rob's code)
            sfs = SFS.SFS(sfs_list)
            # Create string for dictionary key
            l1 = str(N_sims) + '-' + str(bot)
            # Add SFS class instances to appropriate dictionary keys.
            if l1 in sfs_dict:
                sfs_dict[l1][gen] = sfs
            else:
                sfs_dict[l1] = {gen: sfs}
    return sfs_dict
def WikiUpdate(self, wikipage):
    import Keyword
    re_firstchar = re.compile(r'(\w)', re.UNICODE).search
    # signs of UTF8 mojibake
    if isinstance(wikipage, unicode) and u'\xc3' in wikipage:
        wikipage = wikipage.encode('latin-1').decode('utf-8')
    import Wiki
    wikimgr = Wiki.WikiManager()
    wikimgr.Download(wikipage)
    wikimgr.Parse(wikipage)
    wikimgr.Relate(wikipage)
    if wikipage.startswith("SFS/"):
        import SFS
        sfsnr = wikipage.split("/", 1)[1]
        sfsmgr = SFS.SFSManager()
        sfsmgr.config['sfs']['generate_force'] = 'True'
        sfsmgr.Generate(sfsnr)
    elif wikipage.startswith("Dom/"):
        pass
    elif ":" in wikipage:
        # not in default namespace, probably
        (namespace, localpage) = wikipage.split(":", 1)
        if namespace == "Kategori":
            firstletter = re_firstchar(localpage).group(0)
            basefile = u'%s/%s' % (firstletter, localpage)
            kwmgr = Keyword.KeywordManager()
            kwmgr.config['keyword']['parse_force'] = 'True'
            kwmgr.config['keyword']['generate_force'] = 'True'
            kwmgr.Parse(basefile, wiki_keyword=True)  # needed for new terms
            kwmgr.Generate(basefile)
        elif namespace == "Lagen.nu" and localpage == "Huvudsida":
            self._prep_frontpage()
            infile = Util.relpath(os.path.sep.join([__scriptdir__, 'static', 'index.xht2']))
            outfile = Util.relpath(os.path.sep.join([self.baseDir, 'site', 'generated', 'index.html']))
            print "in %s, out %s" % (infile, outfile)
            Util.transform(__scriptdir__ + "/xsl/static.xsl",
                           infile,
                           outfile,
                           validate=False,
                           xinclude=True)
    else:
        firstletter = re_firstchar(wikipage).group(0)
        basefile = u'%s/%s' % (firstletter, wikipage)
        kwmgr = Keyword.KeywordManager()
        kwmgr.config['keyword']['parse_force'] = 'True'
        kwmgr.config['keyword']['generate_force'] = 'True'
        kwmgr.Parse(basefile, wiki_keyword=True)  # needed for new terms
        kwmgr.Generate(basefile)
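# The mojibake check above (encode('latin-1').decode('utf-8') when u'\xc3' appears) reverses
# UTF-8 text that was mistakenly decoded as Latin-1. A minimal, self-contained Python 3 sketch
# of the same round-trip; the helper name and example string are illustrative, not from the source:
def repair_mojibake(text):
    """Undo UTF-8 bytes that were decoded as Latin-1 (e.g. 'fÃ¶r' -> 'för')."""
    if '\xc3' in text:  # tell-tale sign of UTF-8 mojibake
        return text.encode('latin-1').decode('utf-8')
    return text

assert repair_mojibake('fÃ¶r') == 'för'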
def create_sfs_dict(inpath, seq_length):
    """Creates Site-frequency spectrum from VCF

    Args:
        inpath (str): Path with multiple bgzipped VCFs
        seq_length (int): Total sequence length, used to count invariant sites

    Returns:
        sfs_dict (dict): Nested dictionary keyed by 'N-bottleneck' string,
            mapping generation (int) to an SFS.SFS instance
    """
    print('Creating dictionary of Site Frequency Spectra from VCFs in {0}'.format(inpath))
    # Initialize sfs dict
    sfs_dict = {}
    # Get files from inpath
    files = glob(inpath + '**/*.vcf.gz', recursive=True)
    total_vcfs = 0
    for vcf in tqdm(files):
        # Load in VCF file
        my_vcf = VCF(vcf)
        # Split VCF filename to extract population size, bottleneck and generation
        split_filename = vcf.split('/')[-1].split('_')
        N_sims = int(split_filename[0].split('N')[1])
        bot = float(split_filename[1].split('bot')[1])
        gen = int(split_filename[2].split('gen')[1].split('.')[0])
        N_samples = len(my_vcf.samples)
        # Initialize list for sfs
        sfs_list = [0] * ((2 * N_samples) + 1)
        # Add singletons, doubletons, ...n-tons
        for variant in my_vcf:
            AC = variant.INFO.get('AC')
            sfs_list[AC] += 1
        # Add invariant sites to sfs list
        sfs_list[0] = int(seq_length - sum(sfs_list))
        # Use sfs list to instantiate SFS class (Rob's code)
        sfs = SFS.SFS(sfs_list)
        # Create string for dictionary key
        l1 = str(N_sims) + '-' + str(bot)
        # Add SFS class instances to appropriate dictionary keys.
        if l1 in sfs_dict:
            sfs_dict[l1][gen] = sfs
        else:
            sfs_dict[l1] = {gen: sfs}
        total_vcfs += 1
    print('Processed {0} VCFs'.format(total_vcfs))
    return sfs_dict
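# A minimal, self-contained sketch of the filename convention create_sfs_dict expects,
# inferred from its parsing code (the example name 'N1000_bot0.1_gen50.vcf.gz' is
# hypothetical; only the parsing logic mirrors the function above):
fname = 'N1000_bot0.1_gen50.vcf.gz'
parts = fname.split('_')
N_sims = int(parts[0].split('N')[1])               # 1000
bot = float(parts[1].split('bot')[1])              # 0.1
gen = int(parts[2].split('gen')[1].split('.')[0])  # 50
assert (N_sims, bot, gen) == (1000, 0.1, 50)
# The resulting outer dictionary key would be '1000-0.1', with gen (50) as the inner key.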
myargs = [arg.decode(coding) for arg in sys.argv]
# ask for description and place it alongside
# copy the modified file to a safe place
file_to_patch = myargs[1].replace("\\", "/")  # normalize
tmpfile = mktemp()
copy2(file_to_patch, tmpfile)
# Run SFSParser._extractSFST() (and place the file in the correct location)
# or DVParser.word_to_docbook()
if "/sfs/intermediate/" in file_to_patch:
    source = "sfs"
    basefile = file_to_patch.split("/sfs/intermediate/")[1]
    import SFS
    p = SFS.SFSParser()
    sourcefile = file_to_patch.replace("/intermediate/", "/downloaded/sfst/").replace(".txt", ".html")
    print "source %s, basefile %s, sourcefile %s" % (source, basefile, sourcefile)
    plaintext = p._extractSFST([sourcefile])
    f = codecs.open(file_to_patch, "w", 'iso-8859-1', errors="xmlcharrefreplace")
    f.write(plaintext + "\n")
    f.close()
    print "Wrote %s bytes to %s" % (len(plaintext), file_to_patch)
elif "/dv/intermediate/docbook/" in file_to_patch:
    source = "dv"
    basefile = file_to_patch.split("/dv/intermediate/docbook/")[1]
    import DV
    p = DV.DVParser()
    sourcefile = file_to_patch.replace("/docbook/", "/word/").replace(".xml", ".doc")
    print "source %r, basefile %r, sourcefile %r" % (source, basefile, sourcefile)
posiciones_test = np.append(posiciones_test, pos_tipo[mitad:len(pos_tipo)])

# Now we fill four vectors, two for data and two for classes, with the training
# and test items, according to the positions we just obtained
datos_train = np.array([datos[i] for i in posiciones_train])
clases_train = np.array([clases[i] for i in posiciones_train])
datos_test = np.array([datos[i] for i in posiciones_test])
clases_test = np.array([clases[i] for i in posiciones_test])

# Create the knn
knnGPU = knnLooGPU(len(datos_train), len(datos_test), len(datos_train[0]), 3)

if args.algoritmo == 1:
    print("Greedy")
    com = time.time()
    mejores_car, tasa = SFS(clases_train, datos_train, knnGPU)
    fin = time.time()
elif args.algoritmo == 2:
    print("AM1")
    com = time.time()
    mejores_car, tasa = AM1(clases_train, datos_train, knnGPU)
    fin = time.time()
elif args.algoritmo == 3:
    print("AM2")
    com = time.time()
    mejores_car, tasa = AM2(clases_train, datos_train, knnGPU)
    fin = time.time()
elif args.algoritmo == 4:
    print("AM3")
    com = time.time()
    mejores_car, tasa = AM3(clases_train, datos_train, knnGPU)
def __init__(self):
    super(KeywordManager, self).__init__()
    # we use the display_title function
    import SFS
    self.sfsmgr = SFS.SFSManager()
def fit(self, X, y, n_trees, sample_mask=None, X_argsorted=None,
        check_input=True, sample_weight=None, topics=[], featuresToAdd=[],
        enrichment_proportion=2/3, threshold=0):
    """Build a decision tree from the training set (X, y).

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        The training input samples. Use ``dtype=np.float32`` for maximum
        efficiency.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression). Use ``dtype=np.float64``
        and ``order='C'`` for maximum efficiency.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    check_input : boolean, (default=True)
        Allow to bypass several input checking.
        Don't use this parameter unless you know what you do.

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    # Deprecations
    if sample_mask is not None:
        warn("The sample_mask parameter is deprecated as of version 0.14 "
             "and will be removed in 0.16.", DeprecationWarning)

    if X_argsorted is not None:
        warn("The X_argsorted parameter is deprecated as of version 0.14 "
             "and will be removed in 0.16.", DeprecationWarning)

    # Convert data
    if check_input:
        X, = check_arrays(X, dtype=DTYPE, sparse_format="dense",
                          check_ccontiguous=True)

    # Determine output settings
    n_samples, self.n_features_ = X.shape
    is_classification = isinstance(self, ClassifierMixin)

    # Prepare features in case of semantic classification
    UsedFeatures = []
    EnrichmentMatrix = {}
    if self.max_features == "semantic":
        AllFeatures = np.arange(self.n_features_, dtype=np.int32)
        max_features = max(1, int(np.sqrt(self.n_features_) * enrichment_proportion))
        sqrt_features = max(1, int(np.sqrt(self.n_features_)))
        random_state.shuffle(AllFeatures)
        for i in range(max_features):
            UsedFeatures = UsedFeatures + SFS.findSemanticWord(AllFeatures[i], topics, threshold)
        UsedFeatures = list(set(UsedFeatures))
    elif self.max_features == "distributed_semantic":
        numberFeatureToAdd = int(len(topics[2]) / n_trees)
        if numberFeatureToAdd < 1:
            numberFeatureToAdd = 1
        for topic in topics:
            random_state.shuffle(topic)
            for i in range(numberFeatureToAdd):
                UsedFeatures.append(topic[i])
        random_state.shuffle(featuresToAdd)
        featuresNotAssigned = int(len(featuresToAdd) / n_trees)
        for f in range(featuresNotAssigned):
            UsedFeatures.append(featuresToAdd[f])
    elif self.max_features == "semantic_node":
        AllFeatures = np.arange(self.n_features_, dtype=np.int32)
        for i in range(self.n_features_):
            EnrichmentMatrix[i] = list(set(SFS.findSemanticWord(AllFeatures[i], topics, threshold)))

    y = np.atleast_1d(y)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    if is_classification:
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        for k in xrange(self.n_outputs_):
            classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
    else:
        self.classes_ = [None] * self.n_outputs_
        self.n_classes_ = [1] * self.n_outputs_

    self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # Check parameters
    max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

    if isinstance(self.max_features, six.string_types):
        if self.max_features == "auto":
            if is_classification:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                max_features = self.n_features_
        elif self.max_features == "sqrt":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        elif self.max_features == "semantic":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "semantic_node":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "distributed_semantic":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "distributed_semantic_node":
            max_features = max(1, int(np.sqrt(self.n_features_)))
        else:
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    else:  # float
        max_features = int(self.max_features * self.n_features_)

    if len(y) != n_samples:
        raise ValueError("Number of labels=%d does not match "
                         "number of samples=%d" % (len(y), n_samples))
    if self.min_samples_split <= 0:
        raise ValueError("min_samples_split must be greater than zero.")
    if self.min_samples_leaf <= 0:
        raise ValueError("min_samples_leaf must be greater than zero.")
    if max_depth <= 0:
        raise ValueError("max_depth must be greater than zero.")
    if not (0 < max_features <= self.n_features_):
        raise ValueError("max_features must be in (0, n_features]")

    if sample_weight is not None:
        if (getattr(sample_weight, "dtype", None) != DOUBLE or
                not sample_weight.flags.contiguous):
            sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE)
        if len(sample_weight.shape) > 1:
            raise ValueError("Sample weights array has more "
                             "than one dimension: %d" %
                             len(sample_weight.shape))
        if len(sample_weight) != n_samples:
            raise ValueError("Number of weights=%d does not match "
                             "number of samples=%d" %
                             (len(sample_weight), n_samples))

    # Set min_samples_split sensibly
    min_samples_split = max(self.min_samples_split,
                            2 * self.min_samples_leaf)

    # Build tree
    criterion = self.criterion
    if not isinstance(criterion, Criterion):
        if is_classification:
            criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                     self.n_classes_)
        else:
            criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

    splitter = self.splitter
    if not isinstance(self.splitter, Splitter):
        splitter = SPLITTERS[self.splitter](criterion,
                                            max_features,
                                            self.min_samples_leaf,
                                            random_state)

    self.criterion_ = criterion
    self.splitter_ = splitter
    self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                      splitter, max_depth, min_samples_split,
                      self.min_samples_leaf, random_state)

    if self.max_features == "semantic":
        self.tree_.build(X, y, FeaturesToUse=UsedFeatures, sample_weight=sample_weight)
    elif self.max_features == "semantic_node":
        self.tree_.build(X, y, EnrichmentMatrix=EnrichmentMatrix, sample_weight=sample_weight)
    elif self.max_features == "distributed_semantic":
        self.tree_.build(X, y, FeaturesToUse=UsedFeatures, sample_weight=sample_weight)
    elif self.max_features == "distributed_semantic_node":
        self.tree_.build(X, y, topicsToUse=topics, featuresNotAssToUse=featuresToAdd,
                         sample_weight=sample_weight)
    else:
        self.tree_.build(X, y, sample_weight=sample_weight)

    if self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self