Example #1
def main(augur_url, host, port):
    """ Declares singular worker and creates the server and flask app that it will be running on
    """
    app = Flask(__name__)

    #load credentials
    broker_host = read_config("Server", "host", "AUGUR_HOST", "0.0.0.0")
    broker_port = read_config("Server", "port", "AUGUR_PORT", 5000)
    database_host = read_config('Database', 'host', 'AUGUR_DB_HOST', 'host')
    worker_info = read_config('Workers', 'repo_info_worker', None, {})

    worker_port = worker_info['port'] if 'port' in worker_info else port

    # Probe successive ports until one is free (no live worker answering)
    while True:
        try:
            r = requests.get("http://{}:{}/AUGWOP/heartbeat".format(host, worker_port)).json()
            if r.get('status') == 'alive':
                worker_port += 1
            else:
                break
        except (requests.exceptions.RequestException, ValueError):
            break

    logging.basicConfig(filename='worker_{}.log'.format(worker_port), filemode='w', level=logging.INFO)

    config = { 
            "id": "com.augurlabs.core.repo_info_worker.{}".format(worker_port),
            "broker_port": broker_port,
            "broker_host": broker_host,
            "location": "http://{}:{}".format(read_config('Server', 'host', 'AUGUR_HOST', 'localhost'),worker_port),
            "host": database_host,
            "key": read_config("Database", "key", "AUGUR_GITHUB_API_KEY", "key"),
            "password": read_config('Database', 'password', 'AUGUR_DB_PASSWORD', 'password'),
            "port": read_config('Database', 'port', 'AUGUR_DB_PORT', 'port'),
            "user": read_config('Database', 'user', 'AUGUR_DB_USER', 'user'),
            "database": read_config('Database', 'name', 'AUGUR_DB_NAME', 'database'),
            "endpoint": "https://bestpractices.coreinfrastructure.org/projects.json",
            "display_name": "",
            "description": "",
            "required": 1,
            "type": "string"
        }

    #create instance of the worker
    app.gh_repo_info_worker = GHRepoInfoWorker(config) # declares the worker that will be running on this server with specified config

    create_server(app, None)
    logging.info("Starting Flask App with pid: " + str(os.getpid()) + "...")
    app.run(debug=app.debug, host=host, port=worker_port)
    if app.gh_repo_info_worker._child is not None:
        app.gh_repo_info_worker._child.terminate()
    try:
        requests.post('http://{}:{}/api/unstable/workers/remove'.format(
            config['broker_host'], config['broker_port']), json={"id": config['id']})
    except requests.exceptions.RequestException:
        pass

    logging.info("Killing Flask App: " + str(os.getpid()))
    os.kill(os.getpid(), 9)
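
The port probe above assumes each running worker serves a heartbeat endpoint. A minimal, hypothetical sketch of what create_server presumably registers (the route path is taken from the probe URL and the payload shape from the 'alive' check; this is an illustration, not Augur's actual create_server):

from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/AUGWOP/heartbeat')
def heartbeat():
    # A live worker answers with this payload, which tells a newly
    # starting worker to move on and try the next port.
    return jsonify({'status': 'alive'})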
Example #2
def test_read_config_no_exception():
    db_name = read_config('Database',
                          'user',
                          'AUGUR_DB_USER',
                          'augur',
                          config_file_path="augur.config.json")
    assert db_name == "augur"
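
All of these examples call read_config(section, name, environment_variable, default, config_file_path=...). A minimal sketch of what such a helper plausibly does, assuming environment-variable-first lookup over a JSON config file (an illustration only, not Augur's actual implementation):

import json
import os

def read_config(section, name=None, environment_variable=None, default=None,
                config_file_path='augur.config.json'):
    # 1. The environment variable, when set, wins over the file.
    if environment_variable is not None:
        env_value = os.getenv(environment_variable)
        if env_value is not None:
            return env_value
    # 2. Otherwise look the key up in the JSON config file.
    try:
        with open(config_file_path) as f:
            value = json.load(f)[section]
        if name is not None:
            value = value[name]
        return value
    except (OSError, KeyError, ValueError):
        # 3. Fall back to the caller-supplied default.
        return default

Under this sketch, the assertion above holds whenever AUGUR_DB_USER is unset and the file lacks a Database -> user entry, so the 'augur' default comes back.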
Example #3
    def __init__(self):
        self.upstream_db = 7
        self.cursor = None
        self.cursor_people = None

        self.db = None
        self.db_people = None

        worker_options = read_config("Workers", "facade_worker", None, {})
        if 'repo_directory' in worker_options:
            self.repo_base_directory = worker_options['repo_directory']
        else:
            self.log_activity(
                'Error', "Please specify a 'repo_directory' parameter"
                " in your 'Workers' -> 'facade_worker' object in your config,"
                " set to the directory in which you want to clone repos. Exiting..."
            )
            sys.exit(1)
        self.tool_source = '\'FacadeAugur\''
        self.tool_version = '\'0.0.1\''
        self.data_source = '\'git_repository\''

        # Figure out how much we're going to log
        logging.basicConfig(filename='worker_{}.log'.format(
            worker_options['port']),
                            filemode='w',
                            level=logging.INFO)
        self.log_level = None  #self.get_setting('log_level')
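
Based on the keys this constructor reads ('repo_directory' here, 'port' for the log filename), the Workers -> facade_worker block of augur.config.json presumably looks something like this (the values are placeholders, not taken from the source):

{
    "Workers": {
        "facade_worker": {
            "port": 50100,
            "repo_directory": "/augur/repos/"
        }
    }
}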
Example #4
    def log_activity(self, level, status):

        # Log an activity based upon urgency and user's preference.  If the log level is
        # "Debug", then just print it and don't save it in the database.

        log_options = ('Error', 'Quiet', 'Info', 'Verbose', 'Debug')
        logging.info("* %s\n" % status)
        if self.log_level == 'Debug' and level == 'Debug':
            return

        #if log_options.index(level) <= log_options.index(self.log_level):
        query = ("INSERT INTO utility_log (level,status) VALUES (%s,%s)")
        try:
            self.cursor.execute(query, (level, status))
            self.db.commit()
        except Exception as e:
            logging.info('Error encountered: {}\n'.format(e))

            # Set up the database
            db_user = read_config('Database', 'user', 'AUGUR_DB_USER', 'augur')
            db_pass = read_config('Database', 'password', 'AUGUR_DB_PASSWORD',
                                  'augur')
            db_name = read_config('Database', 'name', 'AUGUR_DB_NAME', 'augur')
            db_host = read_config('Database', 'host', 'AUGUR_DB_HOST',
                                  'localhost')
            db_port = read_config('Database', 'port', 'AUGUR_DB_PORT', 5432)
            # Open a fresh general-purpose connection and retry the insert on it
            self.db, self.cursor = self.database_connection(
                db_host, db_user, db_pass, db_name, db_port, False, False)
            self.cursor.execute(query, (level, status))
            self.db.commit()
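
The database_connection helper itself is not shown in these examples. Judging only from its call sites (host, user, password, name, port, plus two flags that appear to select the people database and multithreaded mode) and the PostgreSQL-style default port of 5432, a plausible sketch:

import psycopg2

def database_connection(host, user, password, name, port, people, multithreaded):
    # Each call opens a fresh connection; Example #8 notes this matters in
    # multithreaded mode because shared cursors are not threadsafe.
    db = psycopg2.connect(host=host, user=user, password=password,
                          dbname=name, port=port)
    return db, db.cursor()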
Example #5
    def commit_model(self):

        # Figure out what we need to do
        limited_run = read_config("Facade", name="limited_run", default=0)
        delete_marked_repos = read_config("Facade", name="delete_marked_repos", default=0)
        pull_repos = read_config("Facade", name="pull_repos", default=0)
        clone_repos = read_config("Facade", name="clone_repos", default=1)
        check_updates = read_config("Facade", name="check_updates", default=0)
        force_updates = read_config("Facade", name="force_updates", default=0)
        run_analysis = read_config("Facade", name="run_analysis", default=0)
        force_analysis = read_config("Facade", name="force_analysis", default=0)
        nuke_stored_affiliations = read_config("Facade", name="nuke_stored_affiliations", default=0)
        fix_affiliations = read_config("Facade", name="fix_affiliations", default=1)
        force_invalidate_caches = read_config("Facade", name="force_invalidate_caches", default=0)
        rebuild_caches = read_config("Facade", name="rebuild_caches", default=1) #if abs((datetime.datetime.strptime(self.cfg.get_setting('aliases_processed')[:-3], 
            # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(self.cfg.get_setting(
            #   'update_frequency')) else 0
        create_xlsx_summary_files = read_config("Facade", name="create_xlsx_summary_files", default=0)
        multithreaded = read_config("Facade", name="multithreaded", default=1)

        opts,args = getopt.getopt(sys.argv[1:],'hdpcuUaAmnfIrx')
        for opt in opts:
            if opt[0] == '-h':
                print("\nfacade-worker.py does everything by default except invalidating caches\n"
                        "and forcing updates, unless invoked with one of the following options.\n"
                        "In those cases, it will only do what you have selected.\n\n"
                        "Options:\n"
                        "   -d  Delete marked repos\n"
                        "   -c  Run 'git clone' on new repos\n"
                        "   -u  Check if any repos should be marked for updating\n"
                        "   -U  Force all repos to be marked for updating\n"
                        "   -p  Run 'git pull' on repos\n"
                        "   -a  Analyze git repos\n"
                        "   -A  Force all repos to be analyzed\n"
                        "   -m  Disable multithreaded mode (but why?)\n"
                        "   -n  Nuke stored affiliations (if mappings modified by hand)\n"
                        "   -f  Fill empty affiliations\n"
                        "   -I  Invalidate caches\n"
                        "   -r  Rebuild unknown affiliation and web caches\n"
                        "   -x  Create Excel summary files\n\n")
                sys.exit(0)

            elif opt[0] == '-d':
                delete_marked_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: delete marked repos.')

            elif opt[0] == '-c':
                clone_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: clone new repos.')

            elif opt[0] == '-u':
                check_updates = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: checking for repo updates')

            elif opt[0] == '-U':
                force_updates = 1
                self.cfg.log_activity('Info','Option set: forcing repo updates')

            elif opt[0] == '-p':
                pull_repos = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: update repos.')

            elif opt[0] == '-a':
                run_analysis = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: running analysis.')

            elif opt[0] == '-A':
                force_analysis = 1
                run_analysis = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: forcing analysis.')

            elif opt[0] == '-m':
                multithreaded = 0
                self.cfg.log_activity('Info','Option set: disabling multithreading.')

            elif opt[0] == '-n':
                nuke_stored_affiliations = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: nuking all affiliations')

            elif opt[0] == '-f':
                fix_affiliations = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: fixing affiliations.')

            elif opt[0] == '-I':
                force_invalidate_caches = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: Invalidate caches.')

            elif opt[0] == '-r':
                rebuild_caches = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: rebuilding caches.')

            elif opt[0] == '-x':
                create_xlsx_summary_files = 1
                limited_run = 1
                self.cfg.log_activity('Info','Option set: creating Excel summary files.')

        # Get the location of the directory where git repos are stored
        repo_base_directory = self.cfg.repo_base_directory

        # Determine if it's safe to start the script
        current_status = self.cfg.get_setting('utility_status')

        if current_status != 'Idle':
            self.cfg.log_activity('Error','Something is already running, aborting maintenance '
                'and analysis.\nIt is unsafe to continue.')
            # sys.exit(1)

        if len(repo_base_directory) == 0:
            self.cfg.log_activity('Error','No base directory. It is unsafe to continue.')
            self.cfg.update_status('Failed: No base directory')
            sys.exit(1)
            
        # Begin working

        start_time = time.time()
        self.cfg.log_activity('Quiet','Running facade-worker')

        if not limited_run or (limited_run and delete_marked_repos):
            git_repo_cleanup(self.cfg)

        if not limited_run or (limited_run and clone_repos):
            git_repo_initialize(self.cfg)

        if not limited_run or (limited_run and check_updates):
            check_for_repo_updates(self.cfg)

        if force_updates:
            force_repo_updates(self.cfg)

        if not limited_run or (limited_run and pull_repos):
            git_repo_updates(self.cfg)

        if force_analysis:
            force_repo_analysis(self.cfg)

        if not limited_run or (limited_run and run_analysis):
            analysis(self.cfg, multithreaded)

        if nuke_stored_affiliations:
            nuke_affiliations(self.cfg)

        if not limited_run or (limited_run and fix_affiliations):
            fill_empty_affiliations(self.cfg)

        if force_invalidate_caches:
            invalidate_caches(self.cfg)

        if not limited_run or (limited_run and rebuild_caches):
            rebuild_unknown_affiliation_and_web_caches(self.cfg)

        if not limited_run or (limited_run and create_xlsx_summary_files):

            self.cfg.log_activity('Info','Creating summary Excel files')

            # from excel_generators import *

            self.cfg.log_activity('Info','Creating summary Excel files (complete)')



        # All done

        self.cfg.update_status('Idle')
        self.cfg.log_activity('Quiet','facade-worker.py completed')

        elapsed_time = time.time() - start_time

        print('\nCompleted in %s\n' % datetime.timedelta(seconds=int(elapsed_time)))

        self.cfg.cursor.close()
        self.cfg.cursor_people.close()
        self.cfg.db.close()
        self.cfg.db_people.close()
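
A minimal illustration of the getopt call at the top of this method, with a hypothetical argument vector in place of sys.argv[1:]:

import getopt

opts, args = getopt.getopt(['-c', '-p'], 'hdpcuUaAmnfIrx')
# opts == [('-c', ''), ('-p', '')]: every flag in the option string is a bare
# switch (no ':' suffix, so no argument), which is why the loop above only
# inspects opt[0] and toggles the matching run mode plus limited_run.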
Example #6
def test_read_config_exception():
    with pytest.raises(AttributeError):
        read_config('Server', 'username')
Example #7
def main(augur_url, host, port):
    """ Declares singular worker and creates the server and flask app that it will be running on
    """
    app = Flask(__name__)

    #load credentials
    broker_host = read_config("Server", "host", "AUGUR_HOST", "0.0.0.0")
    broker_port = read_config("Server", "port", "AUGUR_PORT", 5000)
    database_host = read_config('Database', 'host', 'AUGUR_DB_HOST', 'host')
    worker_info = read_config('Workers', 'insight_worker', None, {})

    worker_port = worker_info['port'] if 'port' in worker_info else port

    # Probe successive ports until one is free (no live worker answering)
    while True:
        try:
            r = requests.get("http://{}:{}/AUGWOP/heartbeat".format(host, worker_port)).json()
            if r.get('status') == 'alive':
                worker_port += 1
            else:
                break
        except (requests.exceptions.RequestException, ValueError):
            break

    logging.basicConfig(filename='worker_{}.log'.format(worker_port), filemode='w', level=logging.INFO)

    config = { 
            "id": "com.augurlabs.core.insight_worker.{}".format(worker_port),
            "broker_port": broker_port,
            "broker_host": broker_host,
            "location": "http://{}:{}".format(read_config('Server', 'host', 'AUGUR_HOST', 'localhost'),worker_port),
            "host": database_host,
            "key": read_config("Database", "key", "AUGUR_GITHUB_API_KEY", "key"),
            "password": read_config('Database', 'password', 'AUGUR_DB_PASSWORD', 'password'),
            "port": read_config('Database', 'port', 'AUGUR_DB_PORT', 'port'),
            "user": read_config('Database', 'user', 'AUGUR_DB_USER', 'user'),
            "database": read_config('Database', 'name', 'AUGUR_DB_NAME', 'database'),
            "endpoint": "https://bestpractices.coreinfrastructure.org/projects.json",
            "anomaly_days": worker_info['anomaly_days'] if 'anomaly_days' in worker_info else 2,
            "training_days": worker_info['training_days'] if 'training_days' in worker_info else 365,
            "confidence_interval": worker_info['confidence_interval'] if 'confidence_interval' in worker_info else .95,
            "contamination": worker_info['contamination'] if 'contamination' in worker_info else 0.041,
            'metrics': worker_info['metrics'] if 'metrics' in worker_info else {"issues-new": "issues", 
                "code-changes": "commit_count", "code-changes-lines": "added", 
                "reviews": "pull_requests", "contributors-new": "new_contributors"}
        }

    #create instance of the worker
    app.insight_worker = InsightWorker(config) # declares the worker that will be running on this server with specified config
    
    create_server(app, None)
    print("Starting Flask App on host {} with port {} with pid: ".format(broker_host, worker_port) + str(os.getpid()) + "...")
    app.run(debug=app.debug, host=host, port=worker_port)
    print("Killing Flask App: {} and telling broker that this worker is disconnected.".format(str(os.getpid())))
    try:
        logging.info("Sending disconnected message to broker... @ -> {} with info: {}\n".format('http://{}:{}/api/unstable/workers'.format(
            config['broker_host'], config['broker_port']), config))
        requests.post('http://{}:{}/api/unstable/workers/remove'.format(
            config['broker_host'], config['broker_port']), json=config) # disconnect message
    except Exception as e:
        logging.info("Ran into error: {}".format(e))
        logging.info("Broker's port is busy, worker will not be able to accept tasks, "
            "please restart Augur if you want this worker to attempt connection again.")
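
A small aside on the config block above: the repeated "worker_info['x'] if 'x' in worker_info else default" pattern is equivalent to dict.get with a default, e.g.:

anomaly_days = worker_info.get('anomaly_days', 2)
training_days = worker_info.get('training_days', 365)
confidence_interval = worker_info.get('confidence_interval', 0.95)
contamination = worker_info.get('contamination', 0.041)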
Example #8
def analyze_commit(cfg, repo_id, repo_loc, commit, multithreaded):

    # This function analyzes a given commit, counting the additions, removals, and
    # whitespace changes. It collects all of the metadata about the commit, and
    # stashes it in the database.  A new database connection is opened each time in
    # case we are running in multithreaded mode, since MySQL cursors are not
    # currently threadsafe.

    ### Local helper functions ###

    def check_swapped_emails(name, email):

        # Sometimes people mix up their name and email in their git settings

        if name.find('@') >= 0 and email.find('@') == -1:
            cfg.log_activity('Debug',
                             'Found swapped email/name: %s/%s' % (email, name))
            return email, name
        else:
            return name, email

    def strip_extra_amp(email):

        # Some repos have multiple ampersands, which really messes up domain pattern
        # matching. This extra info is not used, so we discard it.

        if email.count('@') > 1:
            cfg.log_activity('Debug', 'Found extra @: %s' % email)
            return email[:email.find('@', email.find('@') + 1)]
        else:
            return email

    def discover_alias(email):

        # Match aliases with their canonical email
        fetch_canonical = ("SELECT canonical_email "
                           "FROM contributors_aliases "
                           "WHERE alias_email=%s "
                           "AND cntrb_active = 1")

        cursor_people_local.execute(fetch_canonical, (email, ))
        db_people_local.commit()

        canonical = list(cursor_people_local)

        if canonical:
            for email in canonical:
                return email[0]
        else:
            return email

    def update_contributors(author_em, committer_em, auth_nm, cmtr_nm):

        #Check if an email already exists in the database for either the committer or the author
        #There is a committer and an author on each commit, but only one record in the contributor table (ideally)
        # For each email address. So, for each email address, we need to check if it exists in the contributor
        # Table.
        def contributor_exists(some_email):

            #SQL String to insert values into the contributors table
            some_email = some_email.replace("'", "")
            email_check = (
                """SELECT cntrb_email, tool_source, tool_version, data_source FROM contributors WHERE cntrb_email = '{}'"""
                .format(some_email))

            cursor_local.execute(email_check)

            if cursor_local.fetchone() is not None:
                db_local.commit()
                emails_to_add = some_email
                return True
            else:
                return False

        #SQL to update the contributors table
        cntrb = (
            "INSERT INTO contributors "
            "(cntrb_email,cntrb_canonical,cntrb_full_name,tool_source, tool_version, data_source) "
            "VALUES (%s,%s,%s,'FacadeAugur','0.0.1','git_repository')")

        ## Logic block for updating contributors.
        if contributor_exists(author_em):
            cfg.log_activity(
                'Info', 'Author contributor record already exists: {}'.format(
                    author_em))
        else:
            # add a contributor record for the author
            cursor_local.execute(
                cntrb, (author_em, discover_alias(author_em), str(auth_nm)))
            db_local.commit()
            cfg.log_activity(
                'Info',
                'Stored author contributor with email: {}'.format(author_em))

        if contributor_exists(committer_em):
            cfg.log_activity(
                'Info', 'Author contributor record already exists: {}'.format(
                    committer_em))
        else:
            #add a contributor record for the committer
            cursor_local.execute(
                cntrb,
                (committer_em, discover_alias(committer_em), str(cmtr_nm)))
            db_local.commit()
            cfg.log_activity(
                'Info', 'Stored committer contributor with email: {}'.format(
                    committer_em))

    def store_commit(repos_id, commit, filename, author_name, author_email,
                     author_date, author_timestamp, committer_name,
                     committer_email, committer_date, committer_timestamp,
                     added, removed, whitespace):

        # Fix some common issues in git commit logs and store data.

        # Sometimes git is misconfigured and name/email get swapped
        author_name, author_email = check_swapped_emails(
            author_name, author_email)
        committer_name, committer_email = check_swapped_emails(
            committer_name, committer_email)

        # Some systems append extra info after a second @
        author_email = strip_extra_amp(author_email)
        committer_email = strip_extra_amp(committer_email)

        store = ("""INSERT INTO commits (repo_id,cmt_commit_hash,cmt_filename,
			cmt_author_name,cmt_author_raw_email,cmt_author_email,cmt_author_date,cmt_author_timestamp,
			cmt_committer_name,cmt_committer_raw_email,cmt_committer_email,cmt_committer_date,cmt_committer_timestamp,
			cmt_added,cmt_removed,cmt_whitespace, cmt_date_attempted, tool_source, tool_version, data_source)
			VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""")

        try:
            cursor_local.execute(store, (
                repos_id,
                str(commit),
                filename,
                str(author_name),
                author_email,
                discover_alias(author_email),
                author_date,
                author_timestamp,
                committer_name,
                committer_email,
                discover_alias(committer_email),
                committer_date,
                committer_timestamp,
                added,
                removed,
                whitespace,
                committer_date,
                cfg.tool_source,
                cfg.tool_version,
                cfg.data_source,
            ))

            db_local.commit()
        except Exception:
            try:
                cfg.log_activity(
                    'Info',
                    """Timezone error caught, inspect values: INSERT INTO commits (repo_id,cmt_commit_hash,cmt_filename,
				cmt_author_name,cmt_author_raw_email,cmt_author_email,cmt_author_date,cmt_author_timestamp,
				cmt_committer_name,cmt_committer_raw_email,cmt_committer_email,cmt_committer_date,cmt_committer_timestamp,
				cmt_added,cmt_removed,cmt_whitespace, cmt_date_attempted, tool_source, tool_version, data_source)
				VALUES ({},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{})""".
                    format(repos_id, str(commit), filename,
                           str(author_name), author_email,
                           discover_alias(author_email), author_date,
                           author_timestamp, committer_name, committer_email,
                           discover_alias(committer_email), committer_date,
                           committer_timestamp, added, removed, whitespace,
                           committer_date, cfg.tool_source, cfg.tool_version,
                           cfg.data_source))
            except Exception:
                cfg.log_activity(
                    'Info', 'Something wrong in error log for timezone error')

        cfg.log_activity('Debug', 'Stored commit: %s' % commit)

        # Check if email already exists in db
#		email_check = ("""SELECT cntrb_email, tool_source, tool_version, data_source
#			FROM contributors WHERE cntrb_email = {augur_email} OR cntrb_email = {committer_email}}""")

## Commented out so as to not update contributors
## sean: 11/6/2019
## Goal: Address with the contributors model worker
# try:
# 	update_contributors(author_email, committer_email, author_name, committer_name)
# except Exception: #print(e)
# 	cfg.log_activity('Info', str(traceback.print_exc()))

### The real function starts here ###

    header = True
    filename = ''
    added = 0
    removed = 0
    whitespace = 0

    db_user = read_config('Database', 'user', 'AUGUR_DB_USER', 'augur')
    db_pass = read_config('Database', 'password', 'AUGUR_DB_PASSWORD', 'augur')
    db_name = read_config('Database', 'name', 'AUGUR_DB_NAME', 'augur')
    db_host = read_config('Database', 'host', 'AUGUR_DB_HOST', 'localhost')
    db_port = read_config('Database', 'port', 'AUGUR_DB_PORT', 5432)
    db_user_people = db_user
    db_pass_people = db_pass
    db_name_people = db_name
    db_host_people = db_host
    db_port_people = db_port

    # Set up new threadsafe database connections if multithreading. Otherwise
    # use the global database connections so we don't incur a performance
    # penalty.

    if multithreaded:
        db_local, cursor_local = cfg.database_connection(
            db_host, db_user, db_pass, db_name, db_port, False, True)

        db_people_local, cursor_people_local = cfg.database_connection(
            db_host_people, db_user_people, db_pass_people, db_name_people,
            db_port_people, True, True)

    else:
        db_local = cfg.db
        cursor_local = cfg.cursor

        db_people_local = cfg.db_people
        cursor_people_local = cfg.cursor_people

    # Read the git log

    git_log = subprocess.Popen(
        "git --git-dir %s log -p -M %s -n1 "
        "--pretty=format:'"
        "author_name: %%an%%nauthor_email: %%ae%%nauthor_date:%%ai%%n"
        "committer_name: %%cn%%ncommitter_email: %%ce%%ncommitter_date: %%ci%%n"
        "parents: %%p%%nEndPatch' " % (repo_loc, commit),
        stdout=subprocess.PIPE,
        shell=True)

    ##

    # Stash the commit we're going to analyze so we can back it out if something
    # goes wrong later.
    store_working_commit = ("INSERT INTO working_commits "
                            "(repos_id,working_commit) VALUES (%s,%s)")

    cursor_local.execute(store_working_commit, (repo_id, commit))
    db_local.commit()

    cfg.log_activity('Debug',
                     'Stored working commit and analyzing : %s' % commit)

    for line in git_log.stdout.read().decode("utf-8", errors="ignore").split(
            os.linesep):
        if len(line) > 0:

            if line.find('author_name:') == 0:
                author_name = line[13:]
                continue

            if line.find('author_email:') == 0:
                author_email = line[14:]
                continue

            if line.find('author_date:') == 0:
                author_date = line[12:22]
                author_timestamp = line[12:]
                continue

            if line.find('committer_name:') == 0:
                committer_name = line[16:]
                continue

            if line.find('committer_email:') == 0:
                committer_email = line[17:]
                continue

            if line.find('committer_date:') == 0:
                committer_date = line[16:26]
                committer_timestamp = line[16:]
                continue

            if line.find('parents:') == 0:
                if len(line[9:].split(' ')) == 2:

                    # We found a merge commit, which won't have a filename
                    filename = '(Merge commit)'

                    added = 0
                    removed = 0
                    whitespace = 0
                continue

            if line.find('--- a/') == 0:
                if filename == '(Deleted) ':
                    filename = filename + line[6:]
                continue

            if line.find('+++ b/') == 0:
                if not filename.find('(Deleted) ') == 0:
                    filename = line[6:]
                continue

            if line.find('rename to ') == 0:
                filename = line[10:]
                continue

            if line.find('deleted file ') == 0:
                filename = '(Deleted) '
                continue

            if line.find('diff --git') == 0:

                # Git only displays the beginning of a file in a patch, not
                # the end. We need some kludgery to discern where one starts
                # and one ends. This is the last line always separating
                # files in commits. But we only want to do it for the second
                # time onward, since the first time we hit this line it'll be
                # right after parsing the header and there won't be any useful
                # information contained in it.

                if not header:

                    store_commit(repo_id, commit, filename, author_name,
                                 author_email, author_date, author_timestamp,
                                 committer_name, committer_email,
                                 committer_date, committer_timestamp, added,
                                 removed, whitespace)

                header = False

                # Reset stats and prepare for the next section
                whitespaceCheck = []
                resetRemovals = True
                filename = ''
                added = 0
                removed = 0
                whitespace = 0
                continue

            # Count additions and removals and look for whitespace changes
            if not header:
                if line[0] == '+':

                    # First check if this is a whitespace change
                    if len(line.strip()) == 1:
                        # Line with zero length
                        whitespace += 1

                    else:
                        # Compare against removals, detect whitespace changes
                        whitespaceChange = False

                        for check in whitespaceCheck:

                            # Mark matches of non-trivial length
                            if line[1:].strip() == check and len(
                                    line[1:].strip()) > 8:
                                whitespaceChange = True
                                break

                        if whitespaceChange:
                            # One removal was whitespace, back it out
                            removed -= 1
                            whitespace += 1
                            # Remove the matched line so it is not counted twice
                            whitespaceCheck.remove(check)

                        else:
                            # Did not trigger whitespace criteria
                            added += 1

                    # Once we hit an addition, next removal line will be new.
                    # At that point, start a new collection for checking.
                    resetRemovals = True

                if line[0] == '-':
                    removed += 1
                    if resetRemovals:
                        whitespaceCheck = []
                        resetRemovals = False
                    # Store the line to check next add lines for a match
                    whitespaceCheck.append(line[1:].strip())

    # Store the last stats from the git log
    store_commit(repo_id, commit, filename, author_name, author_email,
                 author_date, author_timestamp, committer_name,
                 committer_email, committer_date, committer_timestamp, added,
                 removed, whitespace)

    # Remove the working commit.
    try:
        remove_commit = ("DELETE FROM working_commits "
                         "WHERE repos_id = %s AND working_commit = %s")
        cursor_local.execute(remove_commit, (repo_id, commit))
        db_local.commit()

        cfg.log_activity('Debug',
                         'Completed and removed working commit: %s' % commit)
    except Exception:
        cfg.log_activity('Info', 'Working Commit: %s' % commit)
    # If multithreading, clean up the local database

    if multithreaded:
        cursor_local.close()
        cursor_people_local.close()
        db_local.close()
        db_people_local.close()
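
For reference, the --pretty=format string above makes each analyzed commit begin with a block like the following synthetic example, which is what the line-prefix parsing loop consumes (note that author_date has no space after the colon, matching the line[12:] slicing):

author_name: Jane Doe
author_email: jane@example.com
author_date:2020-01-02 03:04:05 -0600
committer_name: Jane Doe
committer_email: jane@example.com
committer_date: 2020-01-02 03:04:05 -0600
parents: abc1234
EndPatch
diff --git a/file.py b/file.py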
Example #9
def time_series_metrics(self, entry_info, repo_id):
    training_days = 365
    repo_id = 25432  # hard-coded test repo id; overrides the repo_id argument
    augur_api_host = read_config(
        "Server",
        "host",
        "AUGUR_HOST",
        "0.0.0.0",
        config_file_path='/Users/pratikmishra/augur/augur.config.json')
    augur_api_port = read_config(
        "Server",
        "port",
        "AUGUR_PORT",
        5000,
        config_file_path='/Users/pratikmishra/augur/augur.config.json')
    base_url = 'http://{}:{}/api/unstable/repo-groups/20/repos/{}/'.format(
        augur_api_host, augur_api_port, repo_id)
    begin_date = datetime.datetime.now().replace(
        hour=0, minute=0, second=0,
        microsecond=0) - datetime.timedelta(days=training_days)
    index = pd.date_range(begin_date, periods=training_days, freq='D')
    df = pd.DataFrame(index)
    df.columns = ['date']
    df['date'] = df['date'].astype(str)

    for endpoint in time_series:
        print(endpoint)

        url = base_url + endpoint
        print("Hitting endpoint: " + url + "\n")
        try:
            data = requests.get(url=url).json()
            print(data)
        except ValueError:
            # Response body was not valid JSON; fall back to the raw text
            data = requests.get(url=url).text

        if len(data) == 0:
            print(
                "Endpoint with url: {} returned an empty response. Moving on to next endpoint.\n"
                .format(url))
            continue

        if 'date' not in data[0]:
            logging.info(
                "Endpoint {} is not a timeseries, moving to next endpoint.\n".
                format(endpoint))
            continue

        metric_df = pd.DataFrame.from_records(data)
        metric_df['date'] = pd.to_datetime(metric_df['date']).dt.date
        metric_df['date'] = metric_df['date'].astype(str)
        extra = ['repo', 'rg']
        for column in metric_df.columns:
            if any(x in column for x in extra):
                metric_df.drop(column, axis=1, inplace=True)

        df = pd.DataFrame(
            pd.merge(df,
                     metric_df.loc[:, metric_df.columns],
                     how='left',
                     on='date'))
        metric_df.drop('date', axis=1, inplace=True)
        df.rename(columns={
            i: "{} _ {}".format(endpoint, i)
            for i in metric_df.columns
        },
                  inplace=True)

    df = df.fillna(0)
    #df = df.groupby(df['date']).sum()

    time_series_LSTM_model(self, entry_info, repo_id, df)
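
The loop above left-merges each endpoint's records onto a fixed daily date spine. A self-contained illustration of that pattern with synthetic data:

import pandas as pd

spine = pd.DataFrame({'date': ['2020-01-01', '2020-01-02', '2020-01-03']})
metric = pd.DataFrame({'date': ['2020-01-02'], 'issues': [3]})

merged = spine.merge(metric, how='left', on='date').fillna(0)
# merged keeps one row per day, with 0 filled in where the endpoint
# returned no data for that date.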