Example #1
def pipeline(args):
    pd.options.mode.chained_assignment = None

    # Sanity check args
    if not parent_dir_exists(args.out):
        raise ValueError('--out flag points to an invalid path.')

    print('Preparing files for analysis...')
    gwas_snps, N1, N2 = prep(args.bfile, args.chr, args.start, args.end,
                             args.sumstats1, args.sumstats2, args.N1, args.N2)
    m = len(gwas_snps)
    df = pd.DataFrame(
        OrderedDict([
            ("chr", [args.chr]),
            ("start", [args.start]),
            ("end", [args.end]),
            ("m", [m]),
            ("N1", [N1]),
            ("N2", [N2]),
        ]))
    convert_dict = {
        "chr": int,
        "start": int,
        "end": int,
        "m": int,
        "N1": int,
        "N2": int
    }
    out = df.astype(convert_dict)
    out.to_csv(args.out, sep=' ', na_rep='NA', index=False)
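
Every pipeline in this listing guards its output path with a parent_dir_exists helper that is never shown. A minimal sketch of what it presumably does (the name comes from the snippets; the body is an assumption):

import os

def parent_dir_exists(path):
    # The parent of 'out/results.txt' is 'out'; resolving to an absolute
    # path first makes bare filenames map to the current directory.
    parent = os.path.dirname(os.path.abspath(path))
    return os.path.isdir(parent)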
Example #2
def pipeline(args):
    pd.options.mode.chained_assignment = None

    # Sanity check args
    if not parent_dir_exists(args.out):
        raise ValueError('--out flag points to an invalid path.')

    print('Preparing files for analysis...')
    gwas_snps, reversed_alleles_ref, bed, N1, N2 = prep(
        args.bfile1, args.bfile2, args.partition,
        args.sumstats1, args.sumstats2, args.N1, args.N2)
    print('Calculating LD scores for the first population...')
    ld_scores1 = ldscore(args.bfile1, gwas_snps)
    print('Calculating LD scores for the second population...')
    ld_scores2 = ldscore(args.bfile2, gwas_snps)
    ld_scores1 = ld_scores1[ld_scores1['SNP'].isin(ld_scores2['SNP'])]
    ld_scores2 = ld_scores2[ld_scores2['SNP'].isin(ld_scores1['SNP'])]
    subset_index = gwas_snps['SNP'].isin(ld_scores1['SNP'])
    gwas_snps = gwas_snps[subset_index]
    reversed_alleles_ref = reversed_alleles_ref[subset_index]
    print('{} SNPs included in our analysis...'.format(len(gwas_snps)))
    #print('Calculating heritability...')
    #h_1, h_2 = heritability(gwas_snps, ld_scores1, ld_scores2, N1, N2)
    #print('The genome-wide heritability of the first trait is {}.\nThe genome-wide heritability of the second trait is {}.'.format(h_1, h_2))
    print('Calculating local genetic covariance...')
    out = calculate(args.bfile1, args.bfile2, bed, args.thread, gwas_snps,
                    reversed_alleles_ref, N1, N2, args.genome_wide,
                    ld_scores1, ld_scores2)
    out.to_csv(args.out, sep=' ', na_rep='NA', index=False)
Example #3
def pipeline(args):
    pd.options.mode.chained_assignment = None

    # Sanity check args
    if args.save_ld is not None and args.use_ld is not None:
        raise ValueError('Both the --save-ld and --use-ld flags were set. '
                         'Please use only one of them.')
    if args.save_ld is not None:
        if not parent_dir_exists(args.save_ld):
            raise ValueError('--save-ld flag points to an invalid path.')
    if args.use_ld is not None:
        if not os.path.exists(args.use_ld + '.csv.gz'):
            raise ValueError('--use-ld flag points to an invalid path.')
    if not parent_dir_exists(args.out):
        raise ValueError('--out flag points to an invalid path.')

    print('Preparing files for analysis...')
    gwas_snps, N1, N2, annots = prep(args.bfile, args.annot, args.sumstats1,
                                     args.sumstats2)
    if args.N1 is not None:
        N1 = args.N1
    if args.N2 is not None:
        N2 = args.N2
    if args.use_ld is not None:
        print('Loading LD scores from {}'.format(args.use_ld))
        ld_scores = pd.read_csv(args.use_ld + '.csv.gz', sep=' ')
    else:
        print('Calculating LD scores...')
        ld_scores = ldscore(args.bfile, annots, gwas_snps, args.save_ld)
    print('Calculating correlation...')
    out = calculate(gwas_snps, ld_scores, annots, N1, N2)
    print('\nFinal results:\n{}\n'.format(out))
    print('\nView ldsc.log for verbose output.')
    out.insert(0, 'annot_name', out.index)
    out.to_csv(args.out, sep=' ', na_rep='NA', index=False)
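
This variant is the only one that has to reject --save-ld and --use-ld being set together, a constraint argparse can enforce directly. A hedged sketch of the flag wiring the pipeline implies (the flag names come from the snippet; help text and defaults are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--out', required=True, help='output path')
parser.add_argument('--N1', type=int, help='override sample size of study 1')
parser.add_argument('--N2', type=int, help='override sample size of study 2')
# --bfile, --annot, --sumstats1 and --sumstats2 omitted for brevity.
ld = parser.add_mutually_exclusive_group()
ld.add_argument('--save-ld', help='path prefix for saving LD scores')
ld.add_argument('--use-ld', help='path prefix of precomputed LD scores (.csv.gz)')

With the group in place, argparse itself raises the "use only one of them" error, making the manual check at the top of pipeline redundant.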
Example #4
def pipeline(args):
    pd.options.mode.chained_assignment = None

    # Sanity check args
    if not parent_dir_exists(args.out):
        raise ValueError('--out flag points to an invalid path.')

    print('Preparing files for analysis...')
    gwas_snps, bed, N1, N2 = prep(args.bfile, args.partition, args.sumstats1,
                                  args.sumstats2, args.N1, args.N2)
    print('Calculating LD scores...')
    ld_scores = ldscore(args.bfile, gwas_snps)
    gwas_snps = gwas_snps[gwas_snps['SNP'].isin(ld_scores['SNP'])]
    print('{} SNPs included in our analysis...'.format(len(gwas_snps)))
    print('Calculating heritability...')
    h_1, h_2 = heritability(gwas_snps, ld_scores, N1, N2)
    print(
        'The genome-wide heritability of the first trait is {}.\nThe genome-wide heritability of the second trait is {}.'
        .format(h_1, h_2))
    print('Calculating phenotypic correlation...')
    pheno_corr, pheno_corr_var = pheno(gwas_snps, ld_scores, N1, N2, h_1, h_2)
    print('Calculating local genetic covariance...')
    out = calculate(args.bfile, bed, args.thread, gwas_snps, ld_scores, N1, N2,
                    pheno_corr, pheno_corr_var)
    out.to_csv(args.out, sep=' ', na_rep='NA', index=False)
Example #5
 def scan_doc(self, doc):
     # Some saved pages aren't valid UTF-8; skip those documents.
     try:
         with open(self.domain + '/' + doc, 'r') as f:
             body = f.read()
     except UnicodeDecodeError:
         return
     self.totaldocs += 1
     # Extract the source URL recorded in the document header.
     thing = re.search(r'URL:(.*)\n', body)
     if thing is None:
         url = 'URL NOT FOUND'
     else:
         url = thing.group(1)
     self.doc2url[doc] = url
     # Get the displayed text; BeautifulSoup handled this far better
     # than anything else we tried.
     text = text_from_html(body)
     words = prep.prep(text)
     # Accumulate counts: d[w][doc] is the count of w in doc,
     # d[w]['<total>'] its corpus-wide count, and d[w]['<df>'] the
     # number of documents containing w.
     for w in words:
         if w not in self.d:
             self.d[w] = {'<total>': 1.0, '<df>': 1.0, doc: 1.0}
         elif doc not in self.d[w]:
             self.d[w][doc] = 1.0
             self.d[w]['<total>'] += 1.0
             self.d[w]['<df>'] += 1.0
         else:
             self.d[w][doc] += 1.0
             self.d[w]['<total>'] += 1.0
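
The '<total>' and '<df>' sentinels hold exactly the corpus-wide counts a tf-idf pass needs, presumably the eventual analysis this index feeds. A sketch of scoring one (word, document) pair from this structure (the function itself is hypothetical, not part of the original code):

import math

def tfidf(d, totaldocs, w, doc):
    # Term frequency: raw count of w in doc, 0.0 if absent.
    entry = d.get(w)
    if entry is None:
        return 0.0
    tf = entry.get(doc, 0.0)
    # Inverse document frequency from the '<df>' counter.
    idf = math.log(totaldocs / entry['<df>'])
    return tf * idf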
Example #6
def pipeline(args):
    pd.options.mode.chained_assignment = None

    # Sanity check args
    if not parent_dir_exists(args.out):
        raise ValueError('--out flag points to an invalid path.')

    print('Preparing files for analysis...')
    gwas_snps, N1, N2 = prep(args.bfile, args.chr, args.start, args.end,
                             args.sumstats1, args.sumstats2, args.N1, args.N2)
    print('Calculating local TWAS...')
    out = calculate(args.bfile, gwas_snps, N1, N2, args.h1, args.h2,
                    args.shrinkage)
    out.to_csv(args.out, sep=' ', na_rep='NA', index=False)
Example #7
 def searchDom(self):
     self.searchresults.delete(0, 'end')
     fullDomain = self.fullDomainbox.get()
     searchterms = self.searchtermsbox.get()
     if searchterms == '':
         print("no search term")
         return
     searchterms = prep.prep(searchterms)
     print("The button was pressed! Requesting a search for {0} on {1}".format(searchterms, fullDomain))
     # We have both a domain and a search term.
     # Extract the bare domain name, e.g. https://www.muhlenberg.edu -> "muhlenberg".
     regex = re.compile(r'https?:\/\/(www\.)?(.+)\.(.+)$')
     baseurl = regex.match(fullDomain).group(2)
     if not os.path.exists(baseurl):
         # Make a directory for this domain if one doesn't exist yet.
         print("Folder did not exist, making folder")
         os.makedirs(baseurl)
     if not os.path.exists('dicts'):
         os.makedirs('dicts')
     # Try to load a saved reverse index.
     try:
         with open('dicts/' + baseurl + '.pkl', 'rb') as f:
             ri = pickle.load(f)
     except FileNotFoundError:
         # No saved index; crawl first if the page files are missing,
         # then build the index from the crawled files.
         if not os.path.exists('{0}/{0}page#0.txt'.format(baseurl)):
             crwl = crawler()
             crwl.crawl(fullDomain, 100)
         ri = r_index(baseurl).rind
     # Show at most the top five results.
     results = search.retrieve(searchterms, ri)
     for result in results[:5]:
         self.searchresults.insert(0, result[0])
Example #8
def main():
    # load GUI, w input for domain and search term
    # if domain is not on file, ask user if they'd like to make a file.
    # if they would, crawl us a new database

    domain = input("What domain are we searchin?: ")
    dom_ind = None
    if not os.path.exists('dicts/{0}.pkl'.format(domain)):
        choice0 = input('reverse index file does not exist, make file? (y/n) ')
        if choice0 == 'y':
            if not os.path.exists('{0}/{0}page#0.txt'.format(domain)):
                decide = input('data does not exist, make new folder? (y/n) ')
                if decide == 'y':  # crawl this domain from scratch
                    domaincrawl = crawler()
                    domaincrawl.crawl('http://www.' + domain + '.edu', 100)
                    print('creating reverse index...')
                    ri = r_index(domain)
                    if 'javascript' in ri.d:
                        print("it's not filtering out javascript yet")
                    else:
                        print('no problem, ending program!')
            else:
                print('creating reverse index...')
                ri = r_index(domain)
                with open('dicts/' + domain + '.pkl', 'rb') as f:
                    dom_ind = pickle.load(f)
                if 'javascript' in ri.d:
                    print("it's not filtering out javascript yet")
        else:
            print('no worries, come back another time.')
    else:
        with open('dicts/' + domain + '.pkl', 'rb') as f:
            dom_ind = pickle.load(f)
        print('index loaded')

    # Only run a query if an index was actually loaded.
    if dom_ind is None:
        return
    query = input('what we searchin?: ')
    wordsin = prep.prep(query)
    retrieve(wordsin, dom_ind)
Example #9
    def makeReport(self):

        dfTeamMap = pd.read_excel(self.listOfConfigFile[0], 'Sheet1')

        self.dfMainReport = self.dfMainReport.reset_index(drop=True)
        p = pr.prep()
        p.addLocation(self.dfMainReport)
        p.NextTaskExpected(self.dfMainReport)
        self.dfMainReport = p.change(self.dfMainReport)
        # py.saveExcel(self.listOfConfigFile[2],"Sheet1",self.dfMainReport)
        py.saveExcel(
            "C:\\Users\\prerna.prakash\\Desktop\\Input\\checkthis.xlsx",
            "Sheet1", self.dfMainReport)
        self.dfMainReport = py.vlookupin(self.dfMainReport, dfTeamMap,
                                         'LAST_TASK', 'TASK', ['TEAM'])
        MA = ma.mobileAge(self.dfMainReport)
        self.dfMainReport = MA.Main()
        self.makeTriageSheet()

        self.makeTriage()
Example #10
from MEMM import MEMM
from prep import prep

preprocessing = prep('../Project2_resources/train.txt')
data = preprocessing.pre_process_memm()

memm_classifier = MEMM(data, "validation")
memm_classifier.trainMEMM(True)

test_preprocessing = prep('../Project2_resources/validation.txt')
test_words = test_preprocessing.pre_process_memm_test()
my_prep_test = prep('../Project2_resources/validation.txt')
data_test = my_prep_test.pre_process_hmm_test()

tags = []
words = []


def findTheResult(numberList, tagList):
    templeDict = {'PER': [], 'LOC': [], 'ORG': [], 'MISC': [], 'O': []}
    start_num = numberList[0]
    prev_number = numberList[0]
    number = 0
    prev_tag = None
    for j in range(len(tagList)):
        tag = tagList[j]
        if 'PER' in tag and prev_tag != 'PER':
            if prev_tag is not None:
                templeDict[prev_tag].append([start_num, prev_number])
            start_num = number
Example #11
lines01 = sc.textFile(date)
lines02 = sc.textFile(date2)

#add schema
traces01 = readIn(lines01, '\t')
traces02 = readIn(lines02, '\t')

#make them into dataframe
df01 = sqlContext.createDataFrame(traces01)
df02 = sqlContext.createDataFrame(traces02)

#convert them to vectors
df_conv01 = convDf(df01)

#prepare for ml
df_prepped01 = prep(df_conv01)
df_prepped02 = df02.select("name").distinct()

#function to apply labels
df_labeled = get_labels(df_prepped01, df_prepped02)
df_labeled = df_labeled.na.drop().drop("version_idx")
cols_for_ml = df_prepped01.drop("name").drop("version_idx").schema.names

#pipeline stages
#index the label
labelIndexer = mlf.StringIndexer(inputCol="Label", outputCol="Label_idx")
#vectorise the input
toVec = mlf.VectorAssembler(inputCols=cols_for_ml, outputCol="Features")
#classify
classifier = DecisionTreeClassifier(labelCol="Label_idx",
                                    featuresCol="Features",
Example #12
from prep import prep
from HMM import HMM

my_prep = prep('../Project2_resources/new_train.txt')
data = my_prep.pre_process_hmm()
my_prep_test = prep('../Project2_resources/validation.txt')
data_test = my_prep_test.pre_process_hmm()
correct = 0
total = 0  # don't shadow the built-in sum()
baseline = my_prep.generate_baseline()
for i in range(len(data_test[3])):
    for j in range(len(data_test[3][i])):
        total += 1
        if data_test[0][i][j] in baseline:
            tag = baseline[data_test[0][i][j]]
        else:
            tag = 'O'
        if tag == data_test[3][i][j]:
            correct += 1

print(correct)
print(total)
accuracy = float(correct) / float(total)
print(accuracy)
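
generate_baseline is not shown anywhere in the listing; for NER, a baseline of this shape usually maps each training word to its most frequent tag. A sketch under that assumption (only the idea of a per-word baseline comes from the snippets; the counting logic is hypothetical):

from collections import Counter, defaultdict

def generate_baseline(words, tags):
    # words[i][j] / tags[i][j]: token j of sentence i in the training data.
    counts = defaultdict(Counter)
    for sent_words, sent_tags in zip(words, tags):
        for w, t in zip(sent_words, sent_tags):
            counts[w][t] += 1
    # Keep only the most common tag seen for each word.
    return {w: c.most_common(1)[0][0] for w, c in counts.items()}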
Example #13
 def onRun(self):
     if self.gtfs_mode.get():
         prep.prep()
     if self.images.get() != 1:
         self.images_every.set(0)
     mds.MDS(self.od_directory.get(), self.dimensions.get(),
             self.n_iterations.get(), self.images_every.get(),
             self.kernel_chunk.get(), self.list_size.get(),
             self.debug.get(), self).start()
Example #14
from prep import prep
from HMM import HMM
import csv

my_prep = prep('../Project2_resources/train.txt')
#data = my_prep.pre_process_hmm()
my_prep_test = prep('../Project2_resources/test.txt')
data_test = my_prep_test.pre_process_hmm_test()

baseline = my_prep.generate_baseline()
tags = []
for i in range(len(data_test[0])):
    tag_line = []
    for j in range(len(data_test[0][i])):
        tag = ''
        if data_test[0][i][j] in baseline:
            tag = baseline[data_test[0][i][j]]
        else:
            tag = 'O'
        tag_line.append(tag)
    tags.append(tag_line)

dict = {'PER': '', 'LOC': '', 'ORG': '', 'MISC': '', 'O': ''}
prev_tag = None
prev_number = 0
start_num = 0

for i in range(len(tags)):
    tags_line = tags[i]
    numbers_line = data_test[2][i]
    for j in range(len(tags_line)):
Example #15
def main(mode):
    if mode == 'prep':
        prep()
    elif mode == 'train':
        train()
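
A dispatcher like this is typically driven from the command line; a minimal wiring sketch (the argument handling is an assumption, nothing in the original shows how main is invoked):

import sys

if __name__ == '__main__':
    # e.g. `python run.py prep` or `python run.py train`
    main(sys.argv[1])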
Example #16
def both(pth):
    prep(pth)
    compile(pth)