Exemple #1
0
def modelSelect(train_file, order):
    """Build an n-gram language model for ``train_file`` with the configured toolkit.

    The toolkit is chosen by the ``model_type`` entry of the
    ``language_model`` section in the config file named by ``LMFILE``
    (MITLM, SRILM or KENLM). Every shell command is echoed to stdout right
    before it is executed, and the resulting model is written to
    ``<train_file>.<order>grams``.

    :param train_file: path of the training corpus.
    :param order: n-gram order, formatted into the commands with ``%d``.
    """
    def echo_and_run(command):
        # Show the exact command line, then hand it to the shell.
        print(command)
        os.system(command)

    # The path behind LMFILE depends on the directory the script is run from.
    lm_args = Config(LMFILE).ConfigSectionMap("language_model")
    toolkit = lm_args["model_type"]

    if toolkit == "MITLM":
        # First pass writes the vocabulary, second pass estimates the model.
        echo_and_run("%s -t %s -write-vocab %s.vocab" % (lm_args["location"],train_file, train_file))
        echo_and_run('%s -order %d -v %s.vocab -unk -smoothing ModKN -t %s -write-lm %s.%dgrams' % (lm_args["location"],order, train_file, train_file, train_file, order))
        return

    if toolkit == "SRILM":
        # "location" builds a gzipped model; "location2" rewrites it in the
        # final .<order>grams file.
        echo_and_run('%s -text %s -lm %s.kn.lm.gz -order %d -unk -kndiscount -interpolate -gt3min 1 -gt4min 1 -gt5min 1' % (lm_args["location"],train_file, train_file, order))
        echo_and_run('%s -lm %s.kn.lm.gz -unk -order %d -write-lm %s.%dgrams' % (lm_args["location2"],train_file, order, train_file, order))
        # The intermediate gzipped model is removed without being echoed.
        os.system('rm %s.kn.lm.gz' % train_file)
        return

    if toolkit == "KENLM":
        # The older invocation passed "--interpolate_unigrams 0" (comparable
        # to SRILM); that flag is not recommended and is no longer used here.
        echo_and_run('%s -o %d <%s >%s.%dgrams'% (lm_args["location"],order, train_file, train_file, order))
        return

    print("This is not a recognized language model.")
    print("Check ./lm.ini to make sure 'model' is one")
    print("of MITLM,SRILM,or KENLM")
Exemple #2
0
def _train_fold(train_file, order):
    """Train one language model on ``train_file`` with the configured toolkit.

    The toolkit is chosen by the ``model_type`` entry of the
    ``language_model`` section in the config file named by ``LMFILE``.
    Every shell command is echoed to stdout before it is executed; the
    model is written to ``<train_file>.<order>grams``.
    """
    def echo_and_run(command):
        # Show the exact command line, then hand it to the shell.
        print(command)
        os.system(command)

    # NOTE(review): the return value of buildSmoother was never used; the
    # call is kept in case it has side effects -- confirm and drop if not.
    buildSmoother(order)

    # The path behind LMFILE depends on the directory the script is run from.
    lm_args = Config(LMFILE).ConfigSectionMap("language_model")
    model_type = lm_args["model_type"]

    if model_type == "MITLM":
        # First pass writes the vocabulary, second pass estimates the model.
        echo_and_run("%s -t %s -write-vocab %s.vocab" % (lm_args["location"], train_file, train_file))
        echo_and_run('%s -order %d -v %s.vocab -unk -smoothing ModKN -t %s -write-lm %s.%dgrams' % (lm_args["location"], order, train_file, train_file, train_file, order))
    elif model_type == "SRILM":
        # "location" builds a gzipped model; "location2" rewrites it in the
        # final .<order>grams file.
        echo_and_run('%s -text %s -lm %s.kn.lm.gz -order %d -unk -kndiscount -interpolate' % (lm_args["location"], train_file, train_file, order))
        echo_and_run('%s -lm %s.kn.lm.gz -unk -order %d -write-lm %s.%dgrams' % (lm_args["location2"], train_file, order, train_file, order))
        os.system('rm %s.kn.lm.gz' % train_file)
    elif model_type == "KENLM":
        # "-S 5%" is passed through to the KenLM binary unchanged.
        echo_and_run('%s -o %d -S %s --interpolate_unigrams 0 <%s >%s.%dgrams' % (lm_args["location"], order, "5%", train_file, train_file, order))
    else:
        print("This is not a recognized language model.")
        print("Check ./lm.ini to make sure 'model' is one")
        print("of MITLM,SRILM,or KENLM")


def train(input_file_dir, fold_num, order, downSample, splitSelection):
    """Create the folds and train one language model per fold in parallel.

    ``create_fold`` is called first. One child process is then forked per
    fold; child ``i`` trains on ``<input_file_dir>/fold<i>.train``. The
    parent blocks until ``fold_num`` children have been reaped.

    :param input_file_dir: directory holding the fold files.
    :param fold_num: number of folds, i.e. number of child processes.
    :param order: n-gram order passed to the language-model toolkit.
    :param downSample: forwarded unchanged to ``create_fold``.
    :param splitSelection: forwarded unchanged to ``create_fold``.
    """
    create_fold(input_file_dir, fold_num, downSample, splitSelection)

    for i in range(fold_num):
        pid = os.fork()
        if pid == 0:
            # Child process. It must never return into the parent's fork
            # loop, so every path below ends in os._exit().
            status = 1
            try:
                _train_fold('%s/fold%d.train' % (input_file_dir, i), order)
                status = 0
            except Exception:
                import traceback
                traceback.print_exc()
            finally:
                # os._exit() skips interpreter cleanup, so buffered output
                # has to be flushed by hand first.
                sys.stdout.flush()
                sys.stderr.flush()
                os._exit(status)

    # Reap exactly one child per fold.
    for _ in range(fold_num):
        os.wait()
Exemple #3
0
    def __init__(self, text = "", language = "C", config_file = Util.CONFIG):
        """Set up an empty chunk for ``text`` written in ``language``.

        The keyword file path is taken from the ``file`` entry of the
        ``Keywords`` section of ``config_file``. A language switcher and a
        scope tracker matching ``language`` are created through their
        factories.
        """
        # Keyword file location comes from the Config / .ini system.
        self.KeyWordFile = Config(config_file).ConfigSectionMap("Keywords")['file']

        # Raw text of the chunk and the state derived from it.
        self.text = text
        self.header = "" # name given after '@@' in the log
        self.functions = []
        self.initialized = False

        # Running totals, starting at zero.
        self.total_add = 0
        self.total_del = 0

        # Language-specific helpers; the scope tracker is built from the switcher.
        switcher = LanguageSwitcherFactory.LanguageSwitcherFactory.createLS(language)
        self.langSwitch = switcher
        self.sT = ScopeTrackerFactory.ScopeTrackerFactory.createST(self.langSwitch)
Exemple #4
0
    def test(config_file):
        """Smoke-test the ``Database`` settings found in ``config_file``.

        Prints the parsed section, opens a connection with those settings,
        runs an aggregate query over the configured table (first and last
        commit date per language/project, restricted to Java) and prints
        every returned row.

        :param config_file: path of the .ini file to read.
        """
        cfg = Config(config_file)
        db_config = cfg.ConfigSectionMap("Database")
        # print() does not interpolate logging-style arguments, so the value
        # is formatted explicitly; the 1-tuple keeps %r applied to the dict
        # as a single value.
        print("Database configuration = %r\n" % (db_config,))

        dbCon = DatabaseCon(db_config['database'], db_config['user'], db_config['host'], db_config['port'], db_config['password'])

        sql_command = "SELECT language, project, min(commit_date), max(commit_date)"
        sql_command += " FROM " + db_config['table'] + " Where language iLike 'java' group by language, project"

        rows = dbCon.execute(sql_command)

        for r in rows:
            print(r)
Exemple #5
0
class ConfigInfo:
    '''
    Holds the path of the active config file together with the
    values of the "Flags" section of that .ini file, exposed as
    plain attributes (SEP, DEBUG, DEBUGLITE, DATABASE, CSV, LOGTIME).
    '''
    def __init__(self, newFile):
        self.setConfigFile(newFile)


    def setConfigFile(self, newFile):
        '''
        Point this object at ``newFile`` and reload every flag from it.
        '''
        self.CONFIG = newFile
        self.cfg = Config(newFile)
        flags = self.cfg.ConfigSectionMap("Flags")

        # The separator is used verbatim; every other flag is a boolean
        # parsed with strtobool, stored under the upper-cased option name.
        self.SEP = flags['sep']
        for attribute in ('DEBUG', 'DEBUGLITE', 'DATABASE', 'CSV', 'LOGTIME'):
            enabled = bool(util.strtobool(flags[attribute.lower()]))
            setattr(self, attribute, enabled)
Exemple #6
0
class dumpLogs:
    """Writes parsed log data to the database described by the config file.

    The connection settings and table names come from the ``Database``
    section of the file referenced by ``c_info.CONFIG``; the password is
    supplied separately by the caller.
    """

    def __init__(self, password, c_info):
        """Remember the settings and open the database connection.

        :param password: database password handed to ``DatabaseCon``.
        :param c_info: object exposing ``CONFIG`` (config file path) and
            ``DEBUG`` (read by ``dumpMethodChanges``).
        """
        self.config_info = c_info
        self.cfg = Config(self.config_info.CONFIG)
        self.dbPass = password
        self.connectDb()
        # cleanDb() is deliberately not called here.

    @staticmethod
    def getFullTitleString(keywordDictionary):
        '''
        Create a string specifying not only the database column names
        but also their types.  This is used when automatically creating
        the database table.

        One quoted integer column is emitted per key of
        ``keywordDictionary``, in iteration order, with spaces and
        parentheses in the key replaced by underscores.
        '''
        leading = "(project character varying(500), sha text, language character varying(500)," + \
            " file_name text, is_test boolean, method_name text"
        # Build all keyword columns in one pass instead of growing the
        # string inside a loop.
        keyword_columns = "".join(
            ", \"" + str(key).replace(" ", "_").replace("(", "_").replace(")", "_") + "\" integer"
            for key in keywordDictionary)
        trailing = ", total_adds integer, total_dels integer, warning_alert boolean)"

        return leading + keyword_columns + trailing

    def _qualifiedTable(self, table_key):
        """Return ``<schema>.<table>`` for the table stored under ``table_key``."""
        return self.db_config['schema'] + "." + self.db_config[table_key]

    def connectDb(self):
        """Read the ``Database`` section and open ``self.dbCon`` with it."""
        self.db_config = self.cfg.ConfigSectionMap("Database")
        logging.debug("Database configuration = %r\n", self.db_config)
        self.dbCon = DatabaseCon(self.db_config['database'], self.db_config['user'], \
                                 self.db_config['host'], self.db_config['port'], \
                                 self.dbPass)

    def cleanDb(self):
        """Delete every row of the method-detail and change-summary tables.

        No confirmation is requested (the interactive prompt is disabled);
        the deletions are committed before returning.
        """
        tables = [self._qualifiedTable('table_method_detail'),
                  self._qualifiedTable('table_change_summary')]

        for table in tables:
            print(("Deleting table %r \n" % table))
            self.dbCon.insert("DELETE FROM " + table)

        self.dbCon.commit()

    def close(self):
        """Commit pending work and close the connection."""
        self.dbCon.commit()
        self.dbCon.close()

    #TODO: Improve security here for possible injections?
    def createSummaryTable(self):
        """Create the change-summary table if it does not exist yet."""
        table = self._qualifiedTable('table_change_summary')

        sql_command = "CREATE TABLE IF NOT EXISTS " + table + " (project character varying(500) NOT NULL," + \
            " sha text NOT NULL, author character varying(500), author_email character varying(500)," + \
            " commit_date date, is_bug boolean,"+ \
            " CONSTRAINT change_summary_pkey PRIMARY KEY (project, sha)) WITH (OIDS=FALSE);"
        # NOTE: no ALTER TABLE ... OWNER TO / GRANT statements are issued here.
        self.dbCon.create(sql_command)

    def createMethodChangesTable(self, titleString):
        """Create the method-detail table if it does not exist yet.

        :param titleString: parenthesised column definitions, e.g. the
            value returned by ``getFullTitleString``.
        """
        table = self._qualifiedTable('table_method_detail')

        sql_command = "CREATE TABLE IF NOT EXISTS " + table + titleString + " WITH (OIDS=FALSE);"
        # NOTE: no ALTER TABLE ... OWNER TO / GRANT statements are issued here.
        self.dbCon.create(sql_command)

    def dumpSummary(self, summaryStr):
        """Insert one row into the change-summary table (no commit).

        :param summaryStr: already formatted SQL value list for the columns
            project, sha, author, author_email, commit_date, is_bug. It is
            pasted into the statement verbatim.
        """
        table = self._qualifiedTable('table_change_summary')

        sql_command = "INSERT INTO " + table + \
                      "(project, sha, author, author_email, commit_date, is_bug)" + \
                      " VALUES (" + summaryStr + ")"

        self.dbCon.insert(sql_command)

    def dumpMethodChanges(self, methodChange, titleString):
        """Insert one row into the method-detail table (no commit).

        :param methodChange: already formatted SQL value list, pasted into
            the statement verbatim.
        :param titleString: text placed directly after the table name,
            before ``VALUES``.
        """
        table = self._qualifiedTable('table_method_detail')

        sql_command = "INSERT INTO " + table + titleString + " VALUES (" + methodChange + ")"

        if (self.config_info.DEBUG):
            print(sql_command)

        self.dbCon.insert(sql_command)
parser.add_argument('-c',
                    '--command',
                    help='counter, condition, policy, vmprofile or vmgroup',
                    required=True)
parser.add_argument('-o',
                    '--option',
                    help='list, create or delete',
                    required=True)

# Show the full help text when the script is started without any argument.
# This check has to run before parse_args(): both options are required, so
# parse_args() itself exits with a usage error on an empty command line and
# a check placed after it could never be reached.
if len(sys.argv) <= 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()

# API endpoint and credentials come from the "ConfigApi" section.
config = Config()
api = config.ConfigSectionMap("ConfigApi")['api']
apikey = config.ConfigSectionMap("ConfigApi")['apikey']
secret = config.ConfigSectionMap("ConfigApi")['secret']

cloudstack = CloudStack.Client(api, apikey, secret)

# Project and zone names come from the "Envs" section and are resolved to ids.
project = config.ConfigSectionMap("Envs")['project']
zone = config.ConfigSectionMap("Envs")['zone']
projectid = listProjectId(project)
zoneid = listZoneId(zone)

if args.option == 'list':
    if args.command == 'counter':
        print Colors.BOLD + "Listing counters:" + Colors.ENDC
        print listCounters()
    elif args.command == 'condition':
Exemple #8
0
    def processLog(self, config=""):
        """Parse the git log in ``self.log_file`` and emit the collected commits.

        The file is read line by line as ISO-8859-1 text. A line recognised
        by ``isSha`` starts a new ``Sha`` object, which is appended to
        ``self.shas``; lines accepted by ``isAuthor`` / ``isDate`` are
        consumed next; the remaining non-blank lines before the first
        ``diff --git`` line are joined with tabs into the commit message.
        Inside a diff, ``--- a/`` and ``+++ b/`` headers create patch
        objects and every other line is passed to ``processPatch``.

        After parsing, each collected commit is written to the database
        when the DATABASE flag is set, otherwise to CSV files under
        ``../Results`` when the CSV flag is set, otherwise printed.

        :param config: defaults to ``self.config_info.CONFIG`` when empty.
            NOTE(review): the value is not read anywhere else in this method.
        :return: the ``datetime`` taken right after parsing finished, i.e.
            before the commits are written out.
        """
        if (config == ""):
            config = self.config_info.CONFIG

        # Install `timeout` (defined elsewhere) as the SIGALRM handler;
        # presumably it raises TimeExceededError, which the loop below catches.
        signal.signal(signal.SIGALRM, timeout)

        # The project name is the name of the directory containing the log file.
        project1 = os.path.split(self.log_file)[0]
        project1 = project1.rstrip(os.sep)
        self.project_name = os.path.basename(project1)
        print(("---------- %s ------------\n" % (self.project_name)))

        if (self.config_info.DATABASE):
            dl = dumpLogs(self.dbPass, self.config_info)

        if (self.config_info.CSV):
            if not os.path.isdir("../Results"):
                os.mkdir("../Results")
            # Per-project summary file plus a shared one; both are opened
            # with 'w', so existing content is truncated.
            inf1 = open(
                "../Results/" + str(self.project_name) + "ChangeSummary.csv",
                'w')
            fPtrChangeSummary = open("../Results/" + "ChangeSummary.csv", 'w')

            inf1.write("project,sha,author,author_email,commit_date,is_bug\n")

            inf2 = open(
                "../Results/" + str(self.project_name) + "PatchSummary.csv",
                'w')
            fPtrPatchSummary = open("../Results/" + "PatchSummary.csv", 'w')

            # Build the keyword part of the CSV header: one "adds" and one
            # "dels" column for every keyword marked INCLUDED.
            lst = []
            listToDict = {}
            mockChunk = logChunk(
                "", "C", self.config_info
            )  #TODO: This is C specific,  Why is this C specific?
            lst = mockChunk.readKeywords(lst)
            keywords = [k[0] for k in lst if k[1] == INCLUDED]
            for keyword in keywords:
                listToDict["\"" + str(keyword) + "\" adds"] = 0
                listToDict["\"" + str(keyword) + "\" dels"] = 0

            inf2.write(
                "project, sha, language, file_name, is_test, method_name,total_add,total_del,%s\n"
                % ",".join(sorted(listToDict.keys())))

        # NOTE(review): this handle is never closed in this method.
        inf = codecs.open(self.log_file, "r", "iso-8859-1")

        # Parser state carried from one log line to the next.
        shaObj = None
        patchObj = None
        is_diff = False
        log_mssg = ""
        is_no_prev_ver = False
        is_no_next_ver = False
        curLogChunk = logChunk("", "C", self.config_info)
        linenum = 0

        for l in inf:

            try:
                # Cancel any alarm still pending from the previous line.
                signal.alarm(0)

                sha = self.isSha(l)
                line = l

                #if(self.config_info.DEBUGLITE):
                #    try:
                #        print(line)
                #    except:
                #        pass

                if sha:
                    #Reverting back to version that outputs at the end...
                    #if(shaObj != None):
                    #    if(self.config_info.DEBUGLITE):
                    #        print("Writing Sha:" + sha)

                    #    if(self.config_info.DATABASE):
                    #        shaObj.dumpSha(dl)
                    #    elif(self.config_info.CSV):
                    #        shaObj.shaToCsv(inf1,inf2,fPtrChangeSummary,fPtrPatchSummary)
                    #    else:
                    #        shaObj.printSha()

                    shaObj = Sha(self.project_name, sha)
                    #if(self.config_info.DEBUGLITE): #Save for testing.
                    self.shas.append(
                        shaObj
                    )  #This will become very memory intensive in large git logs.

                    # A new commit starts: back to collecting the log message.
                    is_diff = False
                    log_mssg = ""

                    continue

                elif self.isAuthor(line, shaObj):
                    continue

                elif self.isDate(line, shaObj):
                    continue

                # fullLine keeps the trailing whitespace for processPatch;
                # the stripped copy is used for all header matching.
                fullLine = line
                line = line.rstrip()

                if line.startswith('diff --git '):
                    # The commit message is complete once the first diff starts.
                    shaObj.setLog(log_mssg)
                    is_diff = True
                    is_no_prev_ver = False
                    is_no_next_ver = False
                    continue

                    # NOTE(review): unreachable -- these two lines follow the
                    # `continue` above and are never executed.
                    if patchObj != None:
                        shaObj.patches.append(patchObj)

                elif is_diff == False:
                    # Still inside the commit message: skip blank lines and
                    # join the rest with tabs.
                    if not line.strip():
                        continue
                    log_mssg += line + "\t"

                if is_diff:
                    if line.startswith("--- a/"):
                        #Finish the changes to the old patch object
                        if (patchObj != None):
                            #If there is an existing chunk to parse, process it
                            if (curLogChunk.header != ""):
                                if (self.config_info.DEBUG):
                                    print(("New diff with previous version: " +
                                           line))
                                    print(("HEADER: " + curLogChunk.header))
                                self.processLastChunk(patchObj, curLogChunk)

                            #Reset the current chunk obj
                            if (self.config_info.DEBUG):
                                print("Resetting.")
                            curLogChunk.reset()
                            curLogChunk.setLang(
                                "." + self.cur_lang)  #DOUBLE CHECK ME!

                        patchObj = self.createPatch(line)
                        shaObj.patches.append(patchObj)
                        #print patchObj
                        #print shaObj.patches
                    elif (line == '--- /dev/null'):  #earlier file was empty
                        is_no_prev_ver = True
                    elif (line == '+++ /dev/null'
                          ):  #next file version was empty
                        is_no_next_ver = True
                        continue
                    elif (is_no_prev_ver
                          == True) and line.startswith("+++ b/"):
                        #Finish the changes to the old patch object
                        if (patchObj != None):
                            if (curLogChunk.header !=
                                    ""):  #If there is an existing chunk
                                if (self.config_info.DEBUG):
                                    print((
                                        "New diff with no previous version: " +
                                        line))
                                    print(("HEADER: " + curLogChunk.header))
                                self.processLastChunk(patchObj, curLogChunk)

                                # Unlike the "--- a/" branch above, the reset
                                # here only happens when a chunk was processed.
                                if (self.config_info.DEBUG):
                                    print("Resetting.")
                                curLogChunk.reset()
                                curLogChunk.setLang(
                                    "." + self.cur_lang)  #DOUBLE CHECK ME!

                        patchObj = self.createPatchWithNoPrevVersion(line)
                        shaObj.patches.append(patchObj)
                    else:  #Then we reached a content line.
                        self.processPatch(fullLine, patchObj, curLogChunk)

            except TimeExceededError.TimeExceededError:
                print("Line Timed out, moving to next.")
                continue

        #Clear timeouts.
        signal.alarm(0)

        #Make sure to get the last patch in the file!
        if (curLogChunk.header != ""):  #If there is an existing chunk to parse
            if (self.config_info.DEBUG):
                # `line` still holds the last line read by the loop above.
                print(("Last Patch: " + line))
                print(("HEADER: " + curLogChunk.header))
            self.processLastChunk(patchObj, curLogChunk)

        #if shaObj != None:
        #    shaObj.patches.append(patchObj)

        # Timestamp returned to the caller; taken before any output is written.
        parseFinish = datetime.now()

        if (self.shas != []):  #If the log wasn't empty...
            #Create the change summary table and the method change table now if necessary
            if (self.config_info.DATABASE):
                cfg = Config(self.config_info.CONFIG)
                # NOTE(review): keywordFile is assigned but not used afterwards.
                keywordFile = cfg.ConfigSectionMap("Keywords")
                full_title = dumpLogs.getFullTitleString(
                    curLogChunk.getEmptyKeywordDict())

                dl.createSummaryTable()

                if (
                        full_title != ""
                ):  #Check if the changes table exists and create it if we have a title.
                    dl.createMethodChangesTable(full_title)

            # Output target priority: database, then CSV, then stdout.
            for s in self.shas:
                #s.printSha()
                if s != None:
                    if (self.config_info.DATABASE):
                        s.dumpSha(dl)
                    elif (self.config_info.CSV):
                        s.shaToCsv(inf1, inf2, fPtrChangeSummary,
                                   fPtrPatchSummary)
                    else:
                        s.printSha()

        #Write out last sha.
        #if(shaObj != None and self.config_info.DATABASE):
        #    if(self.config_info.DEBUGLITE):
        #        print("Writing to db.")
        #    shaObj.dumpSha(dl)

        if (self.config_info.DATABASE):
            print("Closing Time.")
            dl.close()

        if (self.config_info.CSV):
            inf1.close()
            inf2.close()
            fPtrChangeSummary.close()
            fPtrPatchSummary.close()

        print("Sha's processed:")
        print((len(self.shas)))

        return parseFinish
Exemple #9
0
class ConfigInfo:
    """Access to the sections of the project's .ini configuration file.

    The ``Database``, ``Repos``, ``Keywords``, ``Log`` and ``Flags``
    sections are loaded once in the constructor; the boolean options of
    ``Flags`` are additionally exposed as upper-case attributes.
    """

    def __init__(self, newFile):
        """Load every section of ``newFile`` and derive the flag attributes."""
        self.configFile = newFile
        self.cfg = Config(self.configFile)

        self.config_db = self.cfg.ConfigSectionMap("Database")
        self.config_repo = self.cfg.ConfigSectionMap("Repos")
        self.config_key = self.cfg.ConfigSectionMap("Keywords")
        self.config_log = self.cfg.ConfigSectionMap("Log")
        self.config_flags = self.cfg.ConfigSectionMap("Flags")

        self.setFlags()

    def setFlags(self):
        """Copy the ``Flags`` section into attributes.

        ``SEP`` has surrounding single quotes stripped; all other flags are
        parsed into booleans with ``strtobool``.
        """
        self.SEP = self.config_flags['sep'].strip('\'')
        self.DEBUG = bool(util.strtobool(self.config_flags['debug']))
        self.DEBUGLITE = bool(util.strtobool(self.config_flags['debuglite']))
        self.DATABASE = bool(util.strtobool(self.config_flags['database']))
        self.CSV = bool(util.strtobool(self.config_flags['csv']))
        self.LOGTIME = bool(util.strtobool(self.config_flags['logtime']))
        self.SZZ = bool(util.strtobool(self.config_flags['szz']))

    def _readRepoEntries(self):
        """Return the non-blank, stripped lines of the repo url file.

        When the file cannot be opened or read, a notice is printed and an
        empty list is returned.
        """
        repo_file = self.config_repo['repo_url_file']
        try:
            # The context manager guarantees the handle is closed again.
            with open(repo_file, 'r') as f:
                stripped = (line.strip() for line in f)
                return [entry for entry in stripped if entry]
        except IOError:
            print("!! Repo url file \"%s\" does not exist." % repo_file)
            print("... Going to process all the repositories in the directory : \"%s\"." % self.getDumpLocation())
            return []

    def _logFlag(self, key, default):
        """Return the boolean ``Log`` option ``key``, or ``default``.

        ``default`` is used when the option is missing or its value is not
        something ``strtobool`` can parse.
        """
        try:
            raw = self.config_log[key]
        except KeyError:
            return default
        try:
            return bool(util.strtobool(raw))
        except (ValueError, AttributeError):
            return default

    def getRepos(self):
        """Return the set of repository names listed in the repo url file.

        Every entry must have the form ``<owner>/<repo>``; the part after
        the slash is collected. The set is empty when the file is missing.

        :raises ValueError: if an entry does not contain exactly one "/".
        """
        repos = set()
        for repo_url in self._readRepoEntries():
            # Entries are URL paths, so the separator is always "/",
            # independent of the platform's os.sep.
            _, repo = repo_url.split("/")
            repos.add(repo)
        return repos

    def getGitUrl(self, projName):
        """Return the GitHub url of project ``projName``.

        The repo url file is searched for the first ``<owner>/<repo>``
        entry whose repo part equals ``projName``. An empty string is
        returned when there is no such entry or the file is missing.

        :raises ValueError: if an entry read before the match does not
            contain exactly one "/".
        """
        for repo_url in self._readRepoEntries():
            _, repo = repo_url.split("/")
            if repo == projName:
                return "http://github.com/" + repo_url
        return ""

    def getProjectLocation(self, projName):
        """Return the path of ``projName`` inside the repository directory."""
        return os.path.join(self.config_repo['repo_locations'], projName)

    def getDumpLocation(self):
        """Return the directory configured as ``repo_locations``."""
        return self.config_repo['repo_locations']

    def getPatchMode(self):
        """Return the ``patch`` option of the ``Log`` section (default True)."""
        return self._logFlag('patch', True)

    def getBugPatchMode(self):
        """Return the ``bugPatch`` option of the ``Log`` section (default True)."""
        return self._logFlag('bugPatch', True)

    def getLanguages(self):
        """Return the comma-separated ``languages`` option as a list.

        An empty list (treated as "all supported languages") is returned
        when the option is missing or is not a string.
        """
        try:
            return self.config_log['languages'].split(",")
        except (KeyError, AttributeError):
            return []
Exemple #10
0
class dumpLogs:
    """Writes change summaries and method changes to the configured database."""

    def __init__(self, configFile='config.ini'):
        """Load ``configFile`` and connect to the database it describes."""
        self.cfg = Config(configFile)
        self.connectDb()
        # cleanDb() is deliberately not called here.

    def connectDb(self):
        """Read the ``Database`` section and open ``self.dbCon`` with it."""
        self.db_config = self.cfg.ConfigSectionMap("Database")
        logging.debug("Database configuration = %r\n", self.db_config)
        settings = self.db_config
        self.dbCon = DatabaseCon(settings['database'], settings['user'],
                                 settings['host'], settings['port'])

    def cleanDb(self):
        """Delete all rows of the method-detail and change-summary tables, then commit."""
        schema = self.db_config['schema']
        targets = [schema + "." + self.db_config[key]
                   for key in ('table_method_detail', 'table_change_summary')]

        # The interactive confirmation is disabled; the answer is fixed.
        answer = 'y'
        if answer.lower().startswith('y'):
            for target in targets:
                print("Deleting table %r \n" % target)
                self.dbCon.insert("DELETE FROM " + target)

        self.dbCon.commit()

    def close(self):
        """Commit pending work and close the connection."""
        self.dbCon.commit()
        self.dbCon.close()

    def dumpSummary(self, summaryStr):
        """Insert one change-summary row built from the value list ``summaryStr`` (no commit)."""
        target = self.db_config['schema'] + "." + self.db_config['table_change_summary']
        statement = "".join([
            "INSERT INTO ",
            target,
            "(project, sha, author, commit_date, is_bug)",
            "VALUES (",
            summaryStr,
            ")",
        ])
        self.dbCon.insert(statement)

    def dumpMethodChanges(self, methodChange):
        """Insert one method-detail row built from the value list ``methodChange`` (no commit)."""
        target = self.db_config['schema'] + "." + self.db_config['table_method_detail']
        statement = "".join([
            "INSERT INTO ",
            target,
            "(project, sha, language, file_name, is_test, method_name, assertion_add, ",
            "assertion_del, total_add, total_del)",
            "VALUES (",
            methodChange,
            ")",
        ])
        self.dbCon.insert(statement)