Example no. 1
import os

import tqdm
from cyvcf2 import VCF  # assumed: the VCF()/.samples/INFO.get() API used below matches cyvcf2

import SFS  # Rob's SFS class (local module)


def create_sfs_dict(inpath):

    # Initialize sfs dict
    sfs_dict = {}

    # Get files from inpath
    files = os.listdir(inpath)

    for filename in tqdm.tqdm(files):
        # print(filename)

        if filename.endswith(".vcf"):
            # print(filename)

            # Load in VCF file
            my_vcf = VCF(inpath + '/' + filename)

            # Split the VCF filename to extract population size, bottleneck and generation
            split_filename = filename.split('_')
            N_sims = int(split_filename[1].split('N')[1])
            bot = float(split_filename[2].split('bot')[1])
            gen = int(split_filename[3].split('gen')[1].split('.')[0])
            # print(N_sims, bot, gen)
            N_samples = len(my_vcf.samples)

            # Initialize list for sfs
            sfs_list = [0] * ((2 * N_samples) + 1)

            # Add singletons, doubletons, ...n-tons
            for variant in my_vcf:
                # Allele count for this variant indexes directly into the SFS bins
                AC = variant.INFO.get('AC')
                sfs_list[AC] += 1

            # Add invariant sites to sfs list
            ch_length = 1e8
            sfs_list[0] = int(ch_length - sum(sfs_list))
            # print(sfs_list)

            # Use sfs list to instantiate SFS class (Rob's code)
            sfs = SFS.SFS(sfs_list)

            # Create string for dictionary key
            l1 = str(N_sims) + '-' + str(bot)

            # Add SFS class instances to appropriate dictionary keys.
            if l1 in sfs_dict:
                sfs_dict[l1][gen] = sfs
            else:
                sfs_dict[l1] = {gen: sfs}

    return sfs_dict
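A minimal, self-contained sketch of the binning step used above, with invented allele counts: each variant's AC value indexes directly into the SFS list, and slot 0 is then filled with the remaining invariant sites (ch_length is the 1e8 value hard-coded in the function).

# Sketch of the SFS binning in create_sfs_dict; the allele counts below are
# invented for illustration, in the function they come from variant.INFO.get('AC').
N_samples = 4
ch_length = 1e8
allele_counts = [1, 1, 2, 5, 8, 8, 8]

sfs_list = [0] * ((2 * N_samples) + 1)        # bins 0 .. 2N
for AC in allele_counts:
    sfs_list[AC] += 1                         # singletons, doubletons, ... n-tons
sfs_list[0] = int(ch_length - sum(sfs_list))  # everything else is invariant

print(sfs_list)  # [99999993, 2, 1, 0, 0, 1, 0, 0, 3]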
Example no. 2
    def WikiUpdate(self, wikipage):
        import Keyword
        re_firstchar = re.compile(r'(\w)', re.UNICODE).search

        # signs of UTF8 mojibake
        if isinstance(wikipage,unicode) and u'\xc3' in wikipage:
            wikipage =  wikipage.encode('latin-1').decode('utf-8')
        import Wiki
        wikimgr = Wiki.WikiManager()
        wikimgr.Download(wikipage)
        wikimgr.Parse(wikipage) 
        wikimgr.Relate(wikipage)
        if wikipage.startswith("SFS/"):
            import SFS
            sfsnr = wikipage.split("/",1)[1]
            sfsmgr = SFS.SFSManager()
            sfsmgr.config['sfs']['generate_force'] = 'True'
            sfsmgr.Generate(sfsnr)
        elif wikipage.startswith("Dom/"):
            pass
        elif ":" in wikipage: # not in default namespace, probably
            (namespace,localpage) = wikipage.split(":",1)
            if namespace == "Kategori":
                firstletter = re_firstchar(localpage).group(0)
                basefile = u'%s/%s' % (firstletter,localpage)
                kwmgr = Keyword.KeywordManager()
                kwmgr.config['keyword']['parse_force'] = 'True'
                kwmgr.config['keyword']['generate_force'] = 'True'
                kwmgr.Parse(basefile,wiki_keyword=True) # needed for new terms
                kwmgr.Generate(basefile)
            elif namespace == "Lagen.nu" and localpage == "Huvudsida":
                self._prep_frontpage()
                infile = Util.relpath(os.path.sep.join([__scriptdir__,'static','index.xht2']))
                outfile = Util.relpath(os.path.sep.join([self.baseDir,'site','generated','index.html']))
                print "in %s, out %s" % (infile,outfile)
                Util.transform(__scriptdir__+"/xsl/static.xsl", infile, outfile, validate=False, xinclude=True)
        else:
            firstletter = re_firstchar(wikipage).group(0)
            basefile = u'%s/%s' % (firstletter,wikipage)
            kwmgr = Keyword.KeywordManager()
            kwmgr.config['keyword']['parse_force'] = 'True'
            kwmgr.config['keyword']['generate_force'] = 'True'
            kwmgr.Parse(basefile,wiki_keyword=True) # needed for new terms
            # raise ValueError(repr(basefile))
            kwmgr.Generate(basefile)
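For reference, a small standalone sketch of the page-name handling above (the page names are made up): the first word character of the local page name becomes the one-letter shard prefix of basefile.

# Sketch of the basefile derivation in WikiUpdate, with hypothetical page names.
import re

re_firstchar = re.compile(r'(\w)', re.UNICODE).search

for wikipage in ["Avtal", "Kategori:Avtalslagen"]:        # hypothetical page names
    localpage = wikipage.split(":", 1)[1] if ":" in wikipage else wikipage
    firstletter = re_firstchar(localpage).group(0)
    print('%s/%s' % (firstletter, localpage))             # A/Avtal, A/Avtalslagen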
Example no. 3
from glob import glob

from tqdm import tqdm
from cyvcf2 import VCF  # assumed: the VCF()/.samples/INFO.get() API used below matches cyvcf2

import SFS  # Rob's SFS class (local module)


def create_sfs_dict(inpath, seq_length):
    """Creates Site-frequency spectrum from VCF

    Args:
        inpath (str): Path with multiple bgzipped VCFs

    Returns:
        sfs_dict (dict of str: list): Dictionary with simulation as keys and site-frequency spectrum as list value
    """
    print('Creating dictionary of Site Frequency Sprectra from VCFs in {0}'.
          format(inpath))

    # Initialize sfs dict
    sfs_dict = {}

    # Get files from inpath
    files = glob(inpath + '**/*.vcf.gz', recursive=True)

    total_vcfs = 0
    for vcf in tqdm(files):

        # Load in VCF file
        my_vcf = VCF(vcf)

        # Split the VCF filename to extract population size, bottleneck and generation
        split_filename = vcf.split('/')[-1].split('_')
        N_sims = int(split_filename[0].split('N')[1])
        bot = float(split_filename[1].split('bot')[1])
        gen = int(split_filename[2].split('gen')[1].split('.')[0])
        # print(N_sims, bot, gen)
        N_samples = len(my_vcf.samples)

        # Initialize list for sfs
        sfs_list = [0] * ((2 * N_samples) + 1)

        # Add singletons, doubletons, ...n-tons
        for variant in my_vcf:
            AC = variant.INFO.get('AC')
            sfs_list[AC] += 1

        # Add invariant sites to sfs list
        sfs_list[0] = int(seq_length - sum(sfs_list))
        # print(sfs_list)

        # Use sfs list to instantiate SFS class (Rob's code)
        sfs = SFS.SFS(sfs_list)

        # Create string for dictionary key
        l1 = str(N_sims) + '-' + str(bot)

        # Add SFS class instances to appropriate dictionary keys.
        if l1 in sfs_dict:
            sfs_dict[l1][gen] = sfs
        else:
            sfs_dict[l1] = {gen: sfs}

        total_vcfs += 1

    print('Processed {0} VCFs'.format(total_vcfs))

    return sfs_dict
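A hedged usage sketch for the function above. The directory name and sequence length are assumptions; the filenames under inpath are expected to look like N100_bot0.1_gen50.vcf.gz, which is what the split calls on 'N', 'bot' and 'gen' imply.

# Hypothetical call; 'simulations/' and seq_length=1e8 are made-up values.
sfs_dict = create_sfs_dict('simulations/', seq_length=1e8)

for sim_key, by_gen in sfs_dict.items():     # sim_key e.g. '100-0.1'
    for gen, sfs in sorted(by_gen.items()):
        print(sim_key, gen, sfs)             # sfs is an SFS.SFS instance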
Example no. 4
    myargs = [arg.decode(coding) for arg in sys.argv]

    # ask for description and place it alongside

    # copy the modified file to a safe place
    file_to_patch = myargs[1].replace("\\","/") # normalize
    tmpfile = mktemp()
    copy2(file_to_patch,tmpfile)
        
    # Run SFSParser._extractSFST() (and place the file in the correct location)
    # or DVParser.word_to_docbook() 
    if "/sfs/intermediate/" in file_to_patch:
        source = "sfs"
        basefile = file_to_patch.split("/sfs/intermediate/")[1]
        import SFS
        p = SFS.SFSParser()
        sourcefile = file_to_patch.replace("/intermediate/", "/downloaded/sfst/").replace(".txt", ".html")
        print "source %s, basefile %s, sourcefile %s" % (source,basefile,sourcefile)
        plaintext = p._extractSFST([sourcefile])
        f = codecs.open(file_to_patch, "w",'iso-8859-1', errors="xmlcharrefreplace")
        f.write(plaintext+"\n")
        f.close()
        print "Wrote %s bytes to %s" % (len(plaintext), file_to_patch)

    elif "/dv/intermediate/docbook/" in file_to_patch:
        source = "dv"
        basefile = file_to_patch.split("/dv/intermediate/docbook/")[1]
        import DV
        p = DV.DVParser()
        sourcefile = file_to_patch.replace("/docbook/", "/word/").replace(".xml", ".doc")
        print "source %r, basefile %r, sourcefile %r" % (source,basefile,sourcefile)
Example no. 5
    posiciones_test = np.append(posiciones_test, pos_tipo[mitad:len(pos_tipo)])

# Now we split things into four vectors, two for data and two for classes,
# one pair for training and one for test, according to the positions we just obtained
datos_train = np.array([datos[i] for i in posiciones_train])
clases_train = np.array([clases[i] for i in posiciones_train])
datos_test = np.array([datos[i] for i in posiciones_test])
clases_test = np.array([clases[i] for i in posiciones_test])

# Create the knn
knnGPU = knnLooGPU(len(datos_train), len(datos_test), len(datos_train[0]), 3)

if args.algoritmo == 1:
    print("Greedy")
    com = time.time()
    mejores_car, tasa = SFS(clases_train, datos_train, knnGPU)
    fin = time.time()
elif args.algoritmo == 2:
    print("AM1")
    com = time.time()
    mejores_car, tasa = AM1(clases_train, datos_train, knnGPU)
    fin = time.time()
elif args.algoritmo == 3:
    print("AM2")
    com = time.time()
    mejores_car, tasa = AM2(clases_train, datos_train, knnGPU)
    fin = time.time()
elif args.algoritmo == 4:
    print("AM3")
    com = time.time()
    mejores_car, tasa = AM3(clases_train, datos_train, knnGPU)
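The four timing branches above share the same call shape (clases_train, datos_train, knnGPU). A sketch only, assuming SFS, AM1, AM2, AM3, args, the train arrays and knnGPU are in scope as above, of collapsing the branches into a dispatch table:

import time

# Sketch: dispatch table keyed by args.algoritmo; functions as used above.
algoritmos = {1: ('Greedy', SFS), 2: ('AM1', AM1), 3: ('AM2', AM2), 4: ('AM3', AM3)}

nombre, funcion = algoritmos[args.algoritmo]
print(nombre)
com = time.time()
mejores_car, tasa = funcion(clases_train, datos_train, knnGPU)
fin = time.time()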
Example no. 6
 def __init__(self):
     super(KeywordManager, self).__init__()
     # we use the display_title function
     import SFS
     self.sfsmgr = SFS.SFSManager()
Example no. 7
    def fit(self, X, y, n_trees, sample_mask=None, X_argsorted=None, check_input=True,
            sample_weight=None, topics=[], featuresToAdd=[], enrichment_proportion=2.0/3, threshold=0):
        
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Returns
        -------
        self : object
            Returns self.
        """
        
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn("The sample_mask parameter is deprecated as of version 0.14 "
                 "and will be removed in 0.16.", DeprecationWarning)

        if X_argsorted is not None:
            warn("The X_argsorted parameter is deprecated as of version 0.14 "
                 "and will be removed in 0.16.", DeprecationWarning)

        # Convert data
        if check_input:
            X, = check_arrays(X, dtype=DTYPE, sparse_format="dense",
                              check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        # Prepare the feature pool in case of semantic classification
        UsedFeatures = []
        EnrichmentMatrix = {}
        if self.max_features == "semantic":
            AllFeatures = np.arange(self.n_features_, dtype=np.int32)
            max_features = max(1, int(np.sqrt(self.n_features_) * enrichment_proportion))
            sqrt_features = max(1, int(np.sqrt(self.n_features_)))

            random_state.shuffle(AllFeatures)
            for i in range(max_features):
                UsedFeatures = UsedFeatures + SFS.findSemanticWord(AllFeatures[i], topics, threshold)

            UsedFeatures = list(set(UsedFeatures))

        elif self.max_features == "distributed_semantic":
            # Spread the topic features across the n_trees of the ensemble
            numberFeatureToAdd = int(len(topics[2]) / n_trees)
            if numberFeatureToAdd < 1:
                numberFeatureToAdd = 1

            for topic in topics:
                random_state.shuffle(topic)
                for i in range(numberFeatureToAdd):
                    UsedFeatures.append(topic[i])
            random_state.shuffle(featuresToAdd)
            featuresNotAssigned = int(len(featuresToAdd) / n_trees)
            for f in range(featuresNotAssigned):
                UsedFeatures.append(featuresToAdd[f])

        elif self.max_features == "semantic_node":
            AllFeatures = np.arange(self.n_features_, dtype=np.int32)
            for i in range(self.n_features_):
                EnrichmentMatrix[i] = list(set(SFS.findSemanticWord(AllFeatures[i], topics, threshold)))

        y = np.atleast_1d(y)
        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (2 ** 31) - 1 if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            elif self.max_features == "semantic":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "semantic_node":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "distributed_semantic":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "distributed_semantic_node":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt", "log2", "semantic", '
                    '"semantic_node", "distributed_semantic" or '
                    '"distributed_semantic_node".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE or
                    not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(
                    sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split,
                                2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion,
                                                max_features,
                                                self.min_samples_leaf,
                                                random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(self.n_features_, self.n_classes_,
                          self.n_outputs_, splitter, max_depth,
                          min_samples_split, self.min_samples_leaf,
                          random_state)

        if self.max_features == "semantic":
            self.tree_.build(X, y, FeaturesToUse=UsedFeatures, sample_weight=sample_weight)
        elif self.max_features == "semantic_node":
            self.tree_.build(X, y, EnrichmentMatrix=EnrichmentMatrix, sample_weight=sample_weight)
        elif self.max_features == "distributed_semantic":
            self.tree_.build(X, y, FeaturesToUse=UsedFeatures, sample_weight=sample_weight)
        elif self.max_features == "distributed_semantic_node":
            self.tree_.build(X, y, topicsToUse=topics, featuresNotAssToUse=featuresToAdd, sample_weight=sample_weight)
        else:
            self.tree_.build(X, y, sample_weight=sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
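A hedged usage sketch for the semantic mode added above. The estimator class name is hypothetical (the snippet does not show the class definition); what is grounded in the code is that max_features="semantic" routes the feature pool through SFS.findSemanticWord(feature, topics, threshold), that topics is a list of feature-index lists, and that fit() also takes n_trees and threshold.

import numpy as np

# Hypothetical class name and made-up data; keyword arguments mirror fit() above.
X = np.random.rand(100, 20).astype(np.float32)
y = np.random.randint(0, 2, size=100)
topics = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]   # assumed: feature indices grouped by topic

clf = SemanticDecisionTreeClassifier(max_features="semantic")   # hypothetical estimator
clf.fit(X, y, n_trees=10, topics=topics, threshold=0)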