Esempio n. 1
0
    def simulate_false_candidates(self, eloci, FCR=0):
        """
            Simulate the effects of False Candidate Rates (FCR)

            Random genes are drawn from the COB's reference genome and
            appended to the true loci. The number of loci added is FCR
            percent of the number of starting loci, rounded up.

            Parameters
            ----------
            eloci : (iterable of loci objects)
                Set of true starting loci 
            FCR : (float - default = 0)
                False candidate rate. Can provide either a 
                floating point percentage or a whole number.
                (i.e. method will convert 30 <-> 0.30)

            Returns
            -------
            an iterable of loci
                `eloci` itself when FCR is not positive, otherwise a
                numpy array holding the true loci followed by the
                simulated false candidates.
        """
        # Convert between percentage and float
        if 0 < FCR < 1:
            FCR = FCR * 100
        # FCR: add a percentage of false positive SNPs
        if FCR > 0:
            # FCR is a whole-number percentage here, so the divisor is 100
            num_fcr = math.ceil(len(eloci) * (FCR / 100))
            fcr_loci = self.cob.refgen.random_genes(
                num_fcr, window=self.args.candidate_window_size)
            log(
                "Simulating {}% of SNPs as false positive -> adding {} SNPs",
                FCR,
                len(fcr_loci),
            )
            # keep every true locus and append the false candidates
            eloci = np.concatenate([eloci, np.array(list(fcr_loci))])
        return eloci
Esempio n. 2
0
    def simulate_missing_candidates(self, eloci, MCR=0):
        """
            Simulate the effects of Missing Candidate Rates (MCR)

            A random subset of the starting loci is kept so that roughly
            MCR percent of them go missing. At least two loci are always
            requested from the shuffled set.

            Parameters
            ----------
            eloci : (iterable of loci objects)
                Set of true starting loci 
            MCR : (float - default = 0)
                Missing candidate rate. Can provide either a 
                floating point percentage or a whole number.
                (i.e. method will convert 30 <-> 0.30)

            Returns
            -------
            an iterable of loci
                `eloci` itself when MCR is not positive, otherwise the
                leading slice of a random permutation of `eloci`.
        """
        # Fractions strictly between 0 and 1 become whole percentages
        if 0 < MCR < 1:
            MCR = MCR * 100
        # No false negatives requested: hand the input straight back
        if not MCR > 0:
            return eloci
        # Number of loci that survive, never fewer than two
        keep = max(math.ceil(len(eloci) * (1 - (MCR / 100))), 2)
        shuffled = np.random.permutation(eloci)
        survivors = shuffled[0:keep]
        log(
            "Simulating {}% of SNPs missed by GWAS ({} SNPs -> {})",
            MCR,
            len(eloci),
            len(survivors),
        )
        return survivors
Esempio n. 3
0
 def generate_bootstraps(self, loci, overlap):
     """
         Produce bootstrap overlap tables for a set of loci.

         When `self.args.num_bootstraps` is "auto", bootstraps are drawn
         in batches of 50. Drawing stops once 1000 bootstraps exist, or
         once more than 50 of them have a mean score at least as large
         as the empirical mean score (0.05 * 1000 = 50, so the result
         could no longer be significant at 1000 bootstraps).
         Any other value is converted to int and exactly that many
         bootstraps are generated.

         Returns the concatenation (pd.concat) of all bootstrap tables.
     """
     empirical = overlap.score.mean()

     if self.args.num_bootstraps != "auto":
         # Explicit request: run exactly the number asked for
         frames = []
         for idx in range(int(self.args.num_bootstraps)):
             frames.append(self.overlap(loci, bootstrap=True, iter_name=idx))
         return pd.concat(frames)

     # Lazy supply of bootstraps, consumed one batch at a time
     supply = (self.overlap(loci, bootstrap=True, iter_name=idx)
               for idx in range(1000))
     frames = []
     extreme = 0
     while not (extreme > 50 or len(frames) >= 1000):
         # Newest batch goes in front of the ones already collected
         batch = []
         for _ in range(50):
             batch.append(next(supply))
         frames = batch + frames
         # Count bootstraps at least as extreme as the empirical score
         extreme = sum([df.score.mean() >= empirical for df in frames])
         log(
             "Iteration: {} -- current pval: {} {}% complete",
             len(frames),
             extreme / len(frames),
             max(len(frames) / 10, 100 * (extreme / 50)),
         )
     return pd.concat(frames)
Esempio n. 4
0
 def from_file(cls,filename,normalize=True):
     '''
         Build a new instance of `cls` from a SOFT formatted text file.

         The file is read line by line:
           - a line starting with '^' opens a new entity ("TYPE = NAME");
             Series, Sample and Platform get their own classes, anything
             else becomes a generic Soft
           - '!...table_begin' / '!...table_end' delimit a tab separated
             data table whose first row is the header
           - any other '!' line is a "key = value" pair passed to
             update_info on the current entity
           - a '#' line is appended to the current entity's headers

         Completed Sample entities are appended to `self.samples`; any
         other entity is stored as an attribute named after its
         lower-cased type.

         Parameters
         ----------
         filename : path of the SOFT file to read
         normalize : (bool - default = True)
             When true, samples reporting is_raw() are transformed
             before being stored.

         Returns
         -------
         the populated instance
     '''
     self = cls() 

     def _file_away(soft):
         # Attach a completed SOFT entity to the family
         if soft.type == 'Sample':
             if soft.is_raw() and normalize:
                 log("Normalizing {}",soft.name)
                 soft.transform()
             self.samples.append(soft)
         else:
             setattr(self,soft.type.lower(),soft)

     with open(filename,'r') as IN:
         in_data_table = False
         cur_soft = None
         cur_data = list()
         for line in IN:
             line = line.strip()
             if line.startswith('^'):
                 if cur_soft: # Add the filled SOFT to Family
                     _file_away(cur_soft)
                 # WE have a new SOFT
                 soft_type,name = line.replace('^','').replace(' = ','=').split('=',1)
                 soft_type = soft_type.lower().capitalize()
                 if soft_type == 'Series':
                     cur_soft = Series(name)
                 elif soft_type == 'Sample':
                     cur_soft = Sample(name)
                 elif soft_type == 'Platform':
                     cur_soft = Platform(name)
                 else:
                     cur_soft = Soft(name,type=soft_type)
                 cur_data = list()
             elif line.startswith('!') and 'table_begin' in line:
                 in_data_table = True
             elif line.startswith('!') and 'table_end' in line:
                 in_data_table = False
                 # Create DataFrame and append to SOFT
                 cur_headers = cur_data.pop(0)
                 cur_soft.tbl = pd.DataFrame.from_records(data=cur_data,columns=cur_headers)
                 # Index rows by the first column (icol was removed from pandas)
                 cur_soft.tbl.index = cur_soft.tbl.iloc[:,0]
                 # Turn -Inf into NaNs
                 cur_soft.tbl[cur_soft.tbl == float('-Inf')]  = np.nan
                 cur_data = list()
             elif line.startswith("!"):
                 # add info to the current SOFT, stripping the '!Type_' prefix
                 key,val = map(str.strip,line.replace('!'+cur_soft.type+'_','').split('=',1))
                 cur_soft.update_info(key,val)
             elif line.startswith('#'):
                 # Columns descriptions
                 cur_soft.headers.append(line)
             elif in_data_table:
                 cur_data.append(line.replace('"','').split('\t'))
         # The final entity has no following '^' line to trigger storage,
         # so it must be stored here or it would be silently dropped
         if cur_soft:
             _file_away(cur_soft)
     return self
Esempio n. 5
0
 def effective_snps(self,window_size=None,max_genes_between=1):
     ''' 
         Collapse down loci that have overlapping windows.

         Loci are visited in sorted order. A locus that tests as
         contained in (`in`) the most recent collapsed locus is merged
         into it with `+`; otherwise it starts a new collapsed locus.

         Parameters
         ----------
         window_size : (default = None)
             When given, it is assigned to the `window` attribute of
             every locus before collapsing.
         max_genes_between : (default = 1)
             Accepted for interface compatibility; not used by this
             method.

         Returns
         -------
         a list of collapsed loci (empty when there are no loci)
     '''
     locus_list = sorted(self.locus_list)
     if window_size is not None:
         for locus in locus_list:
             locus.window = window_size
     collapsed = []
     for locus in locus_list:
         # Checking `collapsed` first lets the first locus seed the list
         # and keeps an empty locus list from raising IndexError
         if collapsed and locus in collapsed[-1]:
             # Collapse if the windows overlap
             collapsed[-1] = collapsed[-1] + locus
         else:
             collapsed.append(locus)
     log('{}: Found {} SNPs -> {} effective SNPs',self.name,len(self.locus_list),len(collapsed))
     return collapsed
Esempio n. 6
0
    def create(cls,name,description,type='Camoco'):
        '''
            Class method that registers a new camoco dataset and
            returns an instance for it.

            The configured base directory and its logs, databases,
            analyses and tmp sub-directories are created when missing.
            The dataset is then inserted into the `datasets` table of
            the Camoco.Camoco.db database. A ConstraintError raised by
            that insert is only reported as a warning.
        '''
        configured = cf.get('options','basedir')
        basedir = os.path.realpath(os.path.expanduser(configured))

        # Make sure the whole directory layout is in place
        try:    
            os.makedirs(basedir,exist_ok=True)
            for subdir in ("logs","databases","analyses","tmp"):
                os.makedirs(os.path.join(basedir,subdir),exist_ok=True)
        except Exception:
            log(' Could not create files in {}',basedir)
            raise

        # Table setup plus registration of this dataset
        bootstrap_sql = ''' 
                CREATE TABLE IF NOT EXISTS datasets (
                    name TEXT NOT NULL,
                    description TEXT,
                    type TEXT,
                    added datetime DEFAULT CURRENT_TIMESTAMP,
                    PRIMARY KEY(name,type)
                );
                INSERT OR IGNORE INTO datasets (name,description,type)
                VALUES ('Camoco','Camoco base','Camoco');
                INSERT OR FAIL INTO datasets (name,description,type)
                VALUES (?,?,?)'''
        try:
            lite.Connection(
                os.path.join(basedir,'databases','Camoco.Camoco.db')
            ).cursor().execute(bootstrap_sql,(name,description,type))
        except ConstraintError:
            log.warn('CAUTION! {}.{} Database already exists.',name,type)
        return cls(name)
Esempio n. 7
0
 def _guess_groups(dataframe,max_r2=0.99,max_namediff=0.8):
     ''' Given a data frame, this method checks to see that each column has a correlation
         below the max_r2. If it is above, a new column is created using the mean.'''
     # Calculate correlation
     cors = dataframe.corr()
     # Each column starts in its own group
     column_groups = list(range(0,len(cors)))
     # Iterate over upper triangular, this guarantees that 
     # we dont overwrite lower numbered groups
     #
     #   #OOOOOOOOOOO   <- group
     #   i              <- row number (along d)
     #  -------------   <- matrix
     #  |d   x    x     <- row (x means r2 > max)
     #  | d
     #  |  d
     #  |   d
     #  |    d
     for i,row in enumerate(np.triu(cors.as_matrix(),k=1)):
         if any(row > max_r2):
             # higher numbered columns are highly correlated with current column
             which = np.where(row > max_r2)[0]
             from difflib import SequenceMatcher
             for match in which:
                 diffratio = SequenceMatcher(None,dataframe.columns[i],dataframe.columns[match]).ratio()
                 # if column group is already assigned, keep assignment
                 if column_groups[i] != i and diffratio > max_namediff:
                     # Here, samples have high expression correlation AND similar names (i.e. rep1 vs rep2)
                     group = column_groups[i]
                     log("{} is {} correlated with {}",
                         dataframe.columns[i],
                         diffratio,
                         dataframe.columns[match]
                     )
                 else:
                     # Otherwise start your own group
                     group = i
                 for x in which:
                     column_groups[x] = group
     return column_groups
Esempio n. 8
0
    def create(cls, name, description, type='Camoco'):
        '''
            Alternate constructor: set up the camoco directory layout,
            record the dataset in the base database and return an
            instance built from `name`.

            A ConstraintError from the registration insert is logged as
            a warning rather than raised.
        '''
        basedir = cf.get('options', 'basedir')
        basedir = os.path.realpath(os.path.expanduser(basedir))

        try:
            # Base directory first, then everything that lives inside it
            wanted = [basedir]
            wanted.extend(
                os.path.join(basedir, leaf)
                for leaf in ("logs", "databases", "analyses", "tmp"))
            for directory in wanted:
                os.makedirs(directory, exist_ok=True)
        except Exception:
            log(' Could not create files in {}', basedir)
            raise

        registration = (name, description, type)
        try:
            # Create the base camoco database and register the dataset
            db_file = os.path.join(basedir, 'databases', 'Camoco.Camoco.db')
            lite.Connection(db_file).cursor().execute(
                ''' 
                CREATE TABLE IF NOT EXISTS datasets (
                    name TEXT NOT NULL,
                    description TEXT,
                    type TEXT,
                    added datetime DEFAULT CURRENT_TIMESTAMP,
                    PRIMARY KEY(name,type)
                );
                INSERT OR IGNORE INTO datasets (name,description,type)
                VALUES ('Camoco','Camoco base','Camoco');
                INSERT OR FAIL INTO datasets (name,description,type)
                VALUES (?,?,?)''', registration)
        except ConstraintError:
            log.warn('CAUTION! {}.{} Database already exists.', name, type)
        return cls(name)
Esempio n. 9
0
 def __init__(self, name, type='Camoco', basedir="~/.camoco"):
     '''
         Open an existing dataset and load its registration record.

         Parameters
         ----------
         name : dataset name
         type : (default = 'Camoco') dataset type
         basedir : accepted but not used by this method

         Raises
         ------
         TypeError
             When no row in the `datasets` table matches name and type.
     '''
     # Set up our base directory
     self.log = log()
     # A dataset already exists, return it
     self.db = self._database(".".join([type, name]))
     record = self._database('Camoco.Camoco').cursor().execute(
         "SELECT rowid,* FROM datasets WHERE name = ? AND type = ?",
         (name, type)).fetchone()
     if record is None:
         # Unpacking None would also raise TypeError, but without saying
         # which dataset is missing
         raise TypeError('{}.{} does not exist'.format(type, name))
     (self.ID, self.name, self.description, self.type,
      self.added) = record
     cur = self.db.cursor()
     # Key/value store for per-dataset settings
     cur.execute('''
         CREATE TABLE IF NOT EXISTS globals (
             key TEXT,
             val TEXT
         );
         CREATE UNIQUE INDEX IF NOT EXISTS uniqkey ON globals(key)
         ''')
Esempio n. 10
0
 def __init__(self,name,type='Camoco',basedir="~/.camoco"):
     '''
         Attach to a previously created dataset.

         Opens the dataset's own database, reads its row from the
         `datasets` table of the Camoco.Camoco database and makes sure
         the `globals` key/value table exists.

         `basedir` is accepted but not used by this method.

         Raises TypeError when the dataset is not registered.
     '''
     # Set up our base directory
     self.log = log()
     # A dataset already exists, return it
     self.db = self._database(".".join([type,name]))
     registered = self._database('Camoco.Camoco').cursor().execute(
         "SELECT rowid,* FROM datasets WHERE name = ? AND type = ?",
         (name,type)
     ).fetchone()
     # No matching row: fail with a message naming the dataset instead
     # of the unpack-None TypeError the assignment below would raise
     if registered is None:
         raise TypeError('{}.{} does not exist'.format(type,name))
     (self.ID,self.name,self.description,
         self.type,self.added) = registered
     cur = self.db.cursor()  
     cur.execute('''
         CREATE TABLE IF NOT EXISTS globals (
             key TEXT,
             val TEXT
         );
         CREATE UNIQUE INDEX IF NOT EXISTS uniqkey ON globals(key)
         ''')
Esempio n. 11
0
 def wget(id,force=False):
     '''
         Download the GEO series family SOFT archive for `id` into the
         current working directory.

         Nothing is fetched when the archive is already present and
         `force` is False. A failed download is logged, not raised.
     '''
     local_name = "{}_family.soft.gz".format(id)
     if os.path.exists(local_name) and force == False:
         log("{} already exists",id)
         return
     try:
         log("Fetching {}",id)
         # Series live in a folder named after the id with its last
         # three characters replaced by 'nnn'
         folder = id[0:len(id)-3]
         remote = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/{}nnn/{}/soft/{}_family.soft.gz".format(folder,id,id)
         urllib.request.urlretrieve(remote,"{}_family.soft.gz".format(id))
     except Exception:
         log("Could not download {}",id)
Esempio n. 12
0
 def __init__(self,name,type='Camoco',basedir="~/.camoco"):
     '''
         Attach to an existing dataset.

         Loads the dataset's row from the `datasets` table and ensures
         the `globals` key/value table exists in the dataset database.
         A TypeError raised along the way is re-raised as a TypeError
         stating that the dataset does not exist.

         `basedir` is accepted but not used by this method.
     '''
     self.log = log()
     self.type = type
     # Database belonging to this dataset
     self.db = self._database(name)
     try:
         lookup = "SELECT rowid,* FROM datasets WHERE name = ? AND type = ?"
         row = self._database('Camoco',type='Camoco').cursor().execute(
             lookup,(name,type)
         ).fetchone()
         # Raises TypeError when no row was found (row is None)
         self.ID,self.name,self.description,self.type,self.added = row
         self.db.cursor().execute('''
             CREATE TABLE IF NOT EXISTS globals (
                 key TEXT,
                 val TEXT
             );
             CREATE UNIQUE INDEX IF NOT EXISTS uniqkey ON globals(key)
             ''')
     except TypeError as e:
         raise TypeError('{}.{} does not exist'.format(type,name))
Esempio n. 13
0
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        For every requested GWAS term, window size and flank limit, the
        term's loci are mapped to candidate genes. The resulting rows
        are optionally joined with gene info tables and written as a
        tab separated table to `args.out`.

        Parameters
        ----------
        args : object with the attributes
            out, force, refgen, gwas, terms, candidate_window_size,
            candidate_flank_limit, snp2gene, strongest_attr,
            strongest_higher, include_parent_attrs, gene_info

        Returns
        -------
        None. Returns early, writing nothing, when the output file
        already exists and `args.force` is not set.

        Raises
        ------
        ValueError
            When `args.refgen` is neither an available Expr nor RefGen
            dataset, or `args.snp2gene` names no known mapping method.
    '''

    if args.out != sys.stdout:
        # Create any non-existant directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir,exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ),file=sys.stderr
            )
            return None

    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr',args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen 
    elif co.Tools.available_datasets('RefGen',args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Previously fell through and failed later with a NameError
        raise ValueError(
            '{} is not an available Expr or RefGen dataset'.format(args.refgen)
        )

    # Decide the mapping strategy once, it does not change per term
    if 'effective' in args.snp2gene:
        use_effective = True
    elif 'strongest' in args.snp2gene:
        use_effective = False
    else:
        raise ValueError(
            'Unknown SNP to gene mapping method: {}'.format(args.snp2gene)
        )

    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # Collect one frame per parameter combination and concatenate once,
    # instead of re-copying the growing table on every iteration
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if use_effective:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                else:
                    # NOTE(review): `lowest` is fed from strongest_higher,
                    # confirm the flag is not inverted
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                genes = pd.DataFrame([ x.as_dict() for x in 
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term':term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob != False:
                    genes['COB'] = from_cob
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}',info_file)
        # Assume the file is a table
        info = pd.read_table(info_file,sep='\t')
        if len(info.columns) == 1:
            # A single column suggests the file is comma separated
            info = pd.read_table(info_file,sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",','.join(matching_columns))
        data = pd.merge(data,info,how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )
    
    # Generate the output file
    data.to_csv(args.out,index=None,sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
Esempio n. 14
0
import sys
import os

import pandas as pd
import numpy as np
import scipy as sp
import camoco as co

from itertools import chain

from camoco.Tools import log
# Initialize a new log object: the imported `log` is called and the
# module-level name `log` is rebound to the object it returns
log = log()

def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        Prepares the output location: missing parent directories of
        `args.out` are created, and an already existing output file is
        reported on stderr and skipped unless `args.force` is set.
    '''

    writing_to_file = args.out != sys.stdout
    if writing_to_file:
        # Make sure the directory holding the output exists
        parent = os.path.dirname(args.out)
        if parent != '':
            os.makedirs(parent,exist_ok=True)
        already_there = os.path.exists(args.out)
        if already_there and not args.force:
            message = "Output for {} exists! Skipping!".format(args.out)
            print(message,file=sys.stderr)
            return None
Esempio n. 15
0
import sys
import os

import pandas as pd
import numpy as np
import scipy as sp
import camoco as co

from itertools import chain

from camoco.Tools import log
# Initialize a new log object. After this line `log` refers to the
# object returned by the call, no longer to the name that was imported
log = log()

def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        When the output is not stdout, the folder that will hold the
        output is created if needed. If the output file is already
        present and `args.force` is false, a notice goes to stderr and
        the function returns None.
    '''

    destination = args.out
    if destination != sys.stdout:
        # Create any missing directories leading to the output
        folder = os.path.split(destination)[0]
        if folder != '':
            os.makedirs(folder,exist_ok=True)
        if os.path.exists(destination):
            if not args.force:
                print(
                    "Output for {} exists! Skipping!".format(destination),
                    file=sys.stderr
                )
                return None
Esempio n. 16
0
    def from_CLI(cls, args):
        """
            Implements an interface to the CLI to perform GWAS simulation

            For each selected GO term the effective loci are degraded
            with the missing and false candidate simulations, an overlap
            score is computed and compared against bootstraps. The
            per-term tables are concatenated, stored on `self.results`
            and written as a tab separated file to `args.out`.

            Parameters
            ----------
            args : object with the attributes
                GOnt, cob, terms, min_term_size, max_term_size,
                candidate_window_size, candidate_flank_limit,
                percent_mcr, percent_fcr, method, out

                `terms` may contain "all", a single file name holding
                one term id per line, or a list of term ids.

            Returns
            -------
            None. An empty table is written when no term yields results.
        """
        self = cls()
        # Build the base objects
        self.args = args
        # Load camoco objects
        self.go = co.GOnt(self.args.GOnt)
        self.cob = co.COB(self.args.cob)
        self.generate_output_name()

        # Generate an iterable of GO Terms
        if "all" in self.args.terms:
            # Create a list of all terms within the size specification
            terms = list(
                self.go.iter_terms(
                    min_term_size=self.args.min_term_size,
                    max_term_size=self.args.max_term_size,
                ))
        elif os.path.exists(self.args.terms[0]):
            # If parameter is a filename, read term names from the file;
            # the context manager makes sure the handle gets closed
            with open(args.terms[0]) as term_file:
                terms = [self.go[x.strip()] for x in term_file]
        else:
            # Generate terms from a parameter list
            terms = [self.go[x] for x in self.args.terms]
        # Iterate and calculate
        log("Simulating GWAS for {} GO Terms", len(terms))
        if terms:
            # np.min/np.max raise on an empty sequence
            term_sizes = [len(x) for x in terms]
            log("All terms are between {} and {} 'SNPs'", np.min(term_sizes),
                np.max(term_sizes))

        results = []
        for i, term in enumerate(terms):
            log("-" * 75)
            window_size = self.args.candidate_window_size
            flank_limit = self.args.candidate_flank_limit
            # Generate a series of densities for parameters
            num_genes = len([x for x in term.loci if x in self.cob])
            eloci = [
                x for x in term.effective_loci(window_size=window_size)
                if x in self.cob
            ]
            eloci = self.simulate_missing_candidates(eloci,
                                                     self.args.percent_mcr)
            eloci = self.simulate_false_candidates(eloci,
                                                   self.args.percent_fcr)
            log(
                "GWAS Simulation {}: {} ({}/{} genes in {})",
                i,
                term.id,
                len(eloci),
                num_genes,
                self.cob.name,
            )
            # Make sure that the number of genes is adequate
            if num_genes > self.args.max_term_size:
                log("Too many genes... skipping")
                continue
            elif num_genes < self.args.min_term_size:
                log("Too few genes... skipping")
                continue
            elif num_genes == 0:
                continue
            # Generate candidate genes from the effecive loci
            candidates = self.cob.refgen.candidate_genes(
                eloci, flank_limit=flank_limit)
            log(
                "SNP to gene mapping finds {} genes at window:{} bp, "
                "flanking:{} genes",
                len(candidates),
                self.args.candidate_window_size,
                self.args.candidate_flank_limit,
            )
            overlap = self.overlap(eloci)
            # Dont bother bootstrapping on terms with overlap score below 0
            if overlap.score.mean() < 0:
                continue
            bootstraps = self.generate_bootstraps(eloci, overlap)
            bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
            bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
            # Calculate z scores for density
            overlap["zscore"] = (overlap.score - bs_mean) / bs_std
            bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
            # Fraction of bootstrap iterations whose mean score is at
            # least as large as the empirical mean score
            overlap_pval = (sum(
                bootstraps.groupby("iter").apply(lambda x: x.score.mean()) >=
                overlap.score.mean())) / len(bootstraps.iter.unique())
            # Create a results object
            overlap["COB"] = self.cob.name
            overlap["Ontology"] = self.go.name
            overlap["Term"] = term.id
            overlap["WindowSize"] = self.args.candidate_window_size
            overlap["FlankLimit"] = self.args.candidate_flank_limit
            overlap["FCR"] = args.percent_fcr
            overlap["MCR"] = args.percent_mcr
            overlap["NumRealGenes"] = num_genes
            overlap["NumEffective"] = len(eloci)
            overlap["NumCandidates"] = len(candidates)
            overlap["TermSize"] = len(term)
            overlap["TermCollapsedLoci"] = len(eloci)
            overlap["TermPValue"] = overlap_pval
            overlap["NumBootstraps"] = len(bootstraps.iter.unique())
            overlap["Method"] = self.args.method
            results.append(overlap.reset_index())

        if results:
            self.results = pd.concat(results)
        else:
            # pd.concat raises on an empty list; every term was skipped
            log("No terms produced results, writing an empty table")
            self.results = pd.DataFrame()
        self.results.to_csv(args.out, sep="\t", index=False)
Esempio n. 17
0
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        Maps the loci of each requested GWAS term to candidate genes for
        every combination of window size and flank limit, optionally
        joins gene info tables onto the result and writes it as a tab
        separated table to `args.out`.

        Parameters
        ----------
        args : object with the attributes
            out, force, refgen, gwas, terms, candidate_window_size,
            candidate_flank_limit, snp2gene, strongest_attr,
            strongest_higher, include_parent_attrs, gene_info

        Returns
        -------
        None. Returns early, writing nothing, when the output file
        already exists and `args.force` is not set.

        Raises
        ------
        ValueError
            When `args.refgen` is neither an available Expr nor RefGen
            dataset, or `args.snp2gene` names no known mapping method.
    '''

    if args.out != sys.stdout:
        # Create any non-existant directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir,exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ),file=sys.stderr
            )
            return None

    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.available_datasets('Expr',args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen 
    elif co.available_datasets('RefGen',args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Without this branch `refgen` stayed unbound and the first use
        # below failed with a NameError
        raise ValueError(
            '{} is not an available Expr or RefGen dataset'.format(args.refgen)
        )

    # The mapping strategy is fixed for the whole run; validate it once
    if 'effective' in args.snp2gene:
        use_effective = True
    elif 'strongest' in args.snp2gene:
        use_effective = False
    else:
        raise ValueError(
            'Unknown SNP to gene mapping method: {}'.format(args.snp2gene)
        )

    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # One frame per (term, window, flank) combination, joined once at
    # the end rather than re-concatenated on every iteration
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if use_effective:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                else:
                    # NOTE(review): `lowest` receives strongest_higher,
                    # verify the meaning is not reversed
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                genes = pd.DataFrame([ x.as_dict() for x in 
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term':term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob != False:
                    genes['COB'] = from_cob
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}',info_file)
        # Assume the file is a table
        info = pd.read_table(info_file,sep='\t')
        if len(info.columns) == 1:
            # Only one column came back, retry as comma separated
            info = pd.read_table(info_file,sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",','.join(matching_columns))
        data = pd.merge(data,info,how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )
    
    # Generate the output file
    data.to_csv(args.out,index=None,sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(len(data.parent_locus.unique()),len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))