Beispiel #1
0
def main():
    """Extract, clean, and persist Product Hunt features.

    Sets up a dated logger, points the DB_CONFIG/FEATURES environment
    variables at the config and output files, then runs the
    extract -> clean -> write pipeline.
    """
    # Logs go under a per-day directory, stamped in US/Pacific time.
    now = datetime.datetime.now(timezone('US/Pacific')).strftime("%Y-%m-%d")
    logger = logging_config.get_logger(_dir=now,
                                       name="ph_feature_extraction",
                                       console_level=logging.ERROR)
    # Environment variables are how downstream helpers locate these files.
    os.environ['DB_CONFIG'] = os.path.abspath('db/cfg/dbsetup.yml')
    session = setup_db(os.environ['DB_CONFIG'])
    os.environ['FEATURES'] = os.path.abspath('features.csv')

    entries = extract_all_features(session, logger)
    entries = clean_all_features(entries)
    write_all_features(os.environ['FEATURES'], entries)
Beispiel #2
0
def main():
    """Run the feature pipeline: extract, clean, topic-model, discretize.

    Features are written to dataset/features_not_discretized.csv; topic
    modeling and discretization then run over that same file.
    """
    # Logs go under a per-day directory, stamped in US/Pacific time.
    now = datetime.datetime.now(timezone('US/Pacific')).strftime("%Y-%m-%d")
    logger = logging_config.get_logger(_dir=now,
                                       name="ph_feature_extraction",
                                       console_level=logging.ERROR)
    # Environment variables locate the DB config and the output CSV.
    os.environ['DB_CONFIG'] = os.path.abspath('db/cfg/dbsetup.yml')
    session = setup_db(os.environ['DB_CONFIG'])
    # NOTE(review): slicing 8 chars off the cwd and appending a Windows
    # backslash assumes a fixed directory layout and OS -- kept as-is to
    # preserve behavior. TODO: confirm and replace with os.path.join.
    csv_directory = os.getcwd()[:-8] + 'dataset\\'
    features_not_discretized = 'features_not_discretized.csv'
    csv_path = os.path.join(csv_directory, features_not_discretized)
    os.environ['FEATURES'] = os.path.abspath(csv_path)
    # Extract all features (Presentation, Reputation, Time, Affect, Linguistic).
    entries = extract_all_features(session, logger)
    entries = clean_features(entries)
    write_all_features(os.environ['FEATURES'], entries)

    realize_topic_modeling(csv_path)

    discretize_continuous_variables(csv_path)
Beispiel #3
0
    return ProductHuntClient(key, secret, uri, token)


if __name__ == '__main__':
    now = datetime.datetime.now(timezone('US/Pacific')).strftime("%Y-%m-%d")
    now_dt = datetime.datetime.strptime(now, '%Y-%m-%d').date()
    day = None
    day_dt = None
    newest = False
    update = False
    pid = None
    phm = None
    ph_credentials = 'credentials_miner.yml'
    help_string = 'Usage:\n\tpython ph_miner.py [-d|--day=<YYYY-MM-DD>] [-p|--postid=N] [-n|--newest] [-u|--update] ' \
                  '[-c|--credentials=credentials.yml] [--h|--help]'
    logger = logging_config.get_logger(_dir=now, name="ph_miner", console_level=logging.INFO)

    exit_code = 0
    try:
        opts, _ = getopt(sys.argv[1:], "hd:p:nuc:", ["help", "day=", "postid=", "newest", "update", "credentials="])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print(help_string)
                exit(0)
            elif opt in ("-d", "--day"):
                day = arg
                day_dt = datetime.datetime.strptime(day, "%Y-%m-%d").date()
            elif opt in ("-p", "--postid"):
                pid = int(arg)
            elif opt in ("-n", "--newest"):
                newest = True
Beispiel #4
0
import logging
import os

from pytz import timezone
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import defer
from twisted.internet import reactor

from db.orm.tables import Post
from logger import logging_config
from review_user_crawler.spiders.producthunt import ReviewSpider, UserSpider

# Module-level logger writing into a per-day (US/Pacific) log directory.
# NOTE(review): `datetime` is not among the imports visible above --
# confirm it is imported elsewhere in this file.
logger = logging_config.get_logger(_dir=datetime.datetime.now(
    timezone('US/Pacific')).strftime("%Y-%m-%d"),
                                   name="ph_crawler",
                                   console_level=logging.INFO)
"""
Code to work around the twisted.internet.error.ReactorNotRestartable limitation of the Twisted library
See here: https://doc.scrapy.org/en/latest/topics/practices.html
"""


class CrawlersLauncher:
    def __init__(self, session):
        self.session = session
        self.review_urls = []
        self.profile_urls = []
        settings = self.__configure_project()
        self.runner = CrawlerRunner(settings=settings)
        if not e[12]:
            has_website = 0
        e[12] = has_website
        _cleaned_entries.append(e)
    return _cleaned_entries


def write_all_features(outfile, _entries):
    """Dump the extracted feature rows to *outfile* as CSV.

    A header row naming the thirteen feature columns is written first,
    followed by one row per entry; the writer is closed afterwards.
    """
    column_names = ['post_id', 'is_featured', 'score', 'created_at_day', 'created_at_daytime', 'hunter_id',
                    'hunter_followers', 'hunter_has_twitter', 'hunter_has_website', 'maker_id', 'maker_followers',
                    'maker_has_twitter', 'maker_has_website']
    csv_out = CsvWriter(outfile)
    csv_out.writerow(column_names)
    csv_out.writerows(_entries)
    csv_out.close()


if __name__ == '__main__':
    """ set up logging """
    # Per-day log directory, stamped in US/Pacific time.
    now = datetime.datetime.now(timezone('US/Pacific')).strftime("%Y-%m-%d")
    logger = logging_config.get_logger(_dir=now, name="ph_feature_extraction", console_level=logging.ERROR)

    """ set up environment vars """
    # Downstream helpers read these env vars to locate the DB config
    # and the CSV output path.
    os.environ['DB_CONFIG'] = os.path.abspath('db/cfg/dbsetup.yml')
    session = setup_db(os.environ['DB_CONFIG'])
    os.environ['FEATURES'] = os.path.abspath('temp.csv')

    # Pipeline: extract features from the DB, clean them, write the CSV.
    entries = extract_all_features(session, logger)
    entries = clean_all_features(entries)
    write_all_features(os.environ['FEATURES'], entries)
Beispiel #6
0
            liwc_errors = False
            if all_emails:
                resume_month = already_parsed_uid_project_month(aliases, p.name)
                liwc_errors = get_score_by_month(uid, p.name, all_emails, resume_month, nlon, nlon_model)
                del all_emails
            else:
                logger.debug(
                    'No emails from %s <%s> to project \'%s\' mailing lists' % (uid, alias_email_addresses, p.name))
            logger.info('Done processing project %s' % p.name)
            if liwc_errors:
                return True
    return False


if __name__ == '__main__':
    logger = logging_config.get_logger('big5_personality', console_level=logging.DEBUG)
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=True)

    if len(sys.argv) >= 2:
        tool = sys.argv[1]
    else:
        logger.error('Missing mandatory first param for tool: \'liwc15\' or \'p_insights\' expected')
        sys.exit(-1)

    if len(sys.argv) > 2 and sys.argv[2] == 'reset':
        reset_personality_table()
    try:
        """ boolean var storing presence of liwc errors """
        liwc_errors = main()
        if tool == 'liwc15':
        logger.info('Already parsed user: %s' % username)
        session.rollback()


def already_parsed_users():
    """Return how many rows are already stored in the users-location table."""
    parsed_so_far = session.query(UsersLocation).count()
    return parsed_so_far


def reset_users_location_table():
    """Delete every row from the users-location table and commit the wipe."""
    rows = session.query(UsersLocation)
    rows.delete()
    session.commit()
    logger.info('Done resetting table')


if __name__ == '__main__':
    logger = logging_config.get_logger('users_location', console_level=logging.DEBUG)
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=True)

    if len(sys.argv) > 1 and sys.argv[1] == 'reset':
        reset_users_location_table()

    try:
        already_parsed_users = already_parsed_users()

        token = open("github-api-tokens.txt", "r").readline()
        g = Github(token)

        count_users = 0
        for user in get_github_users():
            if count_users == already_parsed_users:
Beispiel #8
0
                               UsersLocation.username).filter(
                                   UsersLocation.location.isnot(None)).all()
    id = -offset
    for g in githubbers:
        id -= 1
        continent = geo.extract_continent(unidecode(g.location))
        if continent:
            row = UsersRegionId(id=id,
                                continent=continent,
                                username=g.username,
                                email=g.email,
                                name=g.name)
            session.add(row)
    session.commit()


# Synthetic-id offsets; presumably chosen to keep emailer and githubber
# ids in disjoint ranges -- confirm against setup_emailers_id and
# setup_githubbers_id_location.
EMAILERS_OFFSET = 900000
GITHUBBERS_OFFSET = 1000000
if __name__ == '__main__':
    logger = logging_config.get_logger('unmask_aliases')

    # Open a DB session from the shared config and assign synthetic ids.
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=True)
    setup_emailers_id(EMAILERS_OFFSET)
    setup_githubbers_id_location(GITHUBBERS_OFFSET)

    # Unmask aliases from the CLI args, then report users left unmatched.
    aliases, everyone = unmask(sys.argv[1:])
    logger.info('Done, looking for unmatched users')
    unmatched = find_missing_aliases(aliases, everyone)
    logger.info('Done: unmatched %s' % len(unmatched))
Beispiel #9
0

def get_already_parsed_projects():
    """Return the set of repo slugs that already have pull requests stored.

    Opens its own DB session from the shared config, so it can be called
    before the caller has created a session.
    """
    SessionWrapper.load_config('../db/cfg/setup.yml')
    s = SessionWrapper.new(init=True)
    # distinct() keeps the query small; a set comprehension replaces the
    # original manual seen-set accumulation loop.
    return {r.slug for r in s.query(PullRequest.slug).distinct()}


if __name__ == '__main__':
    pr_file = 'tmp_pullrequests.csv'
    # comment_file = 'tmp_comments.csv'
    logger = logging_config.get_logger('pr_extractor')
    try:
        tokens = Tokens()
        tokens_iter = tokens.iterator()
        manager = Manager()
        tokens_queue = manager.Queue()
        for token in tokens_iter:
            tokens_queue.put(token)
        tokens_map = manager.dict()

        extractor = PrAndCommentExtractor(tokens, tokens_queue, tokens_map)
        print("Retrieving the list of cloned GitHub project")
        slugs = get_github_slugs(sys.argv[1])
        print("%s" % len(slugs))
        print("Retrieving the list of project already analyzed")
        extractor.seen = get_already_parsed_projects()
Beispiel #10
0
            reward=result["reward"].replace(",", "."))

    session.add(ls)
    session.commit()
    logger.info('Imported results from file: \'%s\'' % result['Filename'])


def reset_table():
    """Wipe the LIWC scores table matching the globally selected dictionary.

    Does nothing if the global ``dictionary`` is neither '2007' nor '2015',
    same as the original if/elif chain.
    """
    table_by_year = {'2007': Liwc2007Scores, '2015': Liwc2015Scores}
    model = table_by_year.get(dictionary)
    if model is not None:
        session.query(model).delete()


if __name__ == '__main__':
    logger = logging_config.get_logger('save_liwc_scores',
                                       console_level=logging.DEBUG)
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=True)

    if len(sys.argv) >= 3:
        dictionary = sys.argv[2]
    else:
        logger.error('Missing mandatory params')
        sys.exit(-1)

    reset_table()

    try:
        with open(sys.argv[1]) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
Beispiel #11
0
    dictionary.
    :return: dictionary of unmasked dev ids (key) and continent (value)
    """
    res = session.query(UsersRegionId.id, UsersRegionId.continent).all()
    uids_cont = dict()
    for _r in res:
        try:
            unmasked_id = alias_map[_r.id]
            uids_cont.update({unmasked_id: _r.continent})
        except KeyError:
            pass
    return uids_cont


if __name__ == '__main__':
    logger = logging_config.get_logger('export', console_level=logging.DEBUG)
    # DB session plus the alias map and continent lookup used by the
    # save_* helpers below (they read these as module globals).
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=True)
    alias_map = load_alias_map('../unmasking/idm/dict/alias_map.dict')
    uids_continent = load_continent_info()

    # NOTE(review): the elif condition is the exact negation of the if,
    # so it behaves as a plain else.
    if len(sys.argv) >= 2:
        tool = sys.argv[1]
    elif len(sys.argv) < 2:
        logger.error(
            'Missing mandatory first param for tool: \'liwc07\', \'liwc15\', or \'p_insights\' expected'
        )
        sys.exit(-1)

    save_personality_results()
    save_commit_results()
    with open(filename, "rb") as f:
        unpickler = pickle.Unpickler(f)
        aliases = unpickler.load()
    return aliases


def get_alias_ids(_map, uid):
    """Return the ids that alias to *uid* in the id map.

    Args:
        _map: dict mapping an alias id to its canonical (unmasked) id.
        uid: canonical id whose aliases are wanted.

    Returns:
        list of alias ids, excluding ``uid`` itself. Order is unspecified,
        matching the original set-based implementation.
    """
    # items() avoids the second per-key lookup of the original
    # `for key in _map.keys(): _map[key]` loop; dict keys are already
    # unique, so the intermediate de-duplicating set was redundant.
    return [key for key, canonical in _map.items() if canonical == uid and key != uid]


def reset_commit_history_table():
    """Delete every row from the commit-history table and commit the wipe."""
    session.query(CommitHistoryDevProject).delete()
    session.commit()


if __name__ == '__main__':
    logger = logging_config.get_logger('commit_history')
    # Open a DB session and start from an empty commit-history table.
    SessionWrapper.load_config('../db/cfg/setup.yml')
    session = SessionWrapper.new(init=True)
    reset_commit_history_table()

    # alias_map is read as a module global by main() and its helpers.
    alias_map = load_alias_map('../unmasking/idm/dict/alias_map.dict')
    try:
        main()
    except KeyboardInterrupt:
        logger.error('Received Ctrl-C or other break signal. Exiting.')
                    format(name, email, db_repo.slug))

        db_repo.min_commit = min_commit
        db_repo.max_commit = max_commit
        db_repo.total_commits = total_commits
        session.add(db_repo)

        session.commit()
        logger.info('Done')
        return slug

    except Exception as e:
        logger.error(msg="{0}: unknown error:\t{1}".format(slug, e))
        traceback.print_exc()
    finally:
        return slug


if __name__ == '__main__':
    logging.basicConfig()
    logger = logging_config.get_logger('commit_analyzer', logging.DEBUG)

    # create a new session and init db tables
    SessionWrapper.load_config('../db/cfg/setup.yml')
    SessionWrapper.new(init=True)

    # os.listdir already returns a list; the original wrapped it in a
    # redundant identity list comprehension.
    repos = os.listdir(os.path.abspath(sys.argv[1]))

    # Parse the commits of every cloned repo under the given folder.
    for r in repos:
        parse_commits(r, repos_folder=sys.argv[1])
Beispiel #14
0
import nltk

from logger import logging_config
from unmasking.geolite2 import cities_contries_continents, cities, countries, continents, SelectedFields

logger = logging_config.get_logger('geo')


def extract_continent(location):
    nes = find_places(location)

    city = None
    likely_city = False
    country = None
    likely_country = False
    continent = None
    likely_continent = False
    for n in nes:
        n = n.lower()
        if n in cities and not likely_city:
            city = n
            likely_city = True
        if n in countries and not likely_country:
            country = n
            likely_country = True
        if n in continents and not likely_continent:
            continent = n
            likely_continent = True

    if not continent:
        if country: