Example #1
def setup():
  # Initialise config.ini if it doesn't exist
  if not os.path.exists("config.ini"):
    config.init_default()

  # Ensure tokens exist
  util.get_tokens()

  # Set up the posting timetable
  util.set_session_scheduler()

  # Run the scheduler on its own thread
  util.run_scheduler_async()
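setup() above assumes config.init_default() writes a fresh config.ini when none exists. A minimal sketch of what such a helper could look like with the standard-library configparser; the section and key names are hypothetical, since the real defaults aren't shown:

import configparser

def init_default(path="config.ini"):
  # Hypothetical defaults; the real keys depend on the project.
  parser = configparser.ConfigParser()
  parser["session"] = {"posts_per_day": "3", "start_hour": "9"}
  parser["auth"] = {"token_file": "tokens.json"}
  with open(path, "w") as config_file:
    parser.write(config_file)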
Example #2
def collect(conn, test=False, max_collect=10):
    ''' Collects social media records of validated study participants.
        - checks for uncollected participant data
        - scrapes, caches, extracts features, and writes to db tables
        - note: this should run as a cron job every 15 minutes, but Dreamhost cron is weird, so we usually run it manually. '''

    # gets table name / field / datatype for all tables as a Pandas data frame
    table_data = util.get_table_data()

    log_msgs = []
    log_msgs.append('Starting collect\n')

    try:
        query = "SELECT username, user_id, uid, medium FROM usernames WHERE collected=0 AND validated=1 LIMIT {}".format(
            max_collect)
        cur = conn.cursor()
        cur.execute(query)
        rows = cur.fetchall()

        for row in rows:

            username, user_id, unique_id, medium = row
            log_msgs.append('Collect for {} user: {} [ID: {}]'.format(
                medium, username, user_id))

            CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET = util.get_tokens(
                conn, medium, username)

            if medium == "twitter":  # big thanks to Andy Reagan here: https://github.com/andyreagan/tweet-stealing

                twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN,
                                  ACCESS_SECRET)
                collect_twitter(twitter, username, unique_id, conn, table_data)

            elif medium == "instagram":

                instagram = InstagramAPI(access_token=ACCESS_TOKEN,
                                         client_secret=CONSUMER_SECRET)

                collect_instagram(instagram, username, unique_id, conn,
                                  table_data)
        log_msgs.append(
            'Collect log completed without top-level errors (check individual logs for per-basis errors).'
        )

    except Exception as error:
        log_msgs.append('Collect error: {}'.format(str(error)))
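collect() builds its SQL by interpolating max_collect with str.format. With a DB-API driver such as sqlite3 the limit can be passed as a bound parameter instead, which keeps values out of the SQL string entirely; a small sketch, assuming ?-style placeholders (other drivers use %s):

query = ("SELECT username, user_id, uid, medium FROM usernames "
         "WHERE collected=0 AND validated=1 LIMIT ?")
cur = conn.cursor()
cur.execute(query, (max_collect,))
rows = cur.fetchall()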
Example #3
def tz_collect(conn, test=False, max_collect=1000):
    ''' Collects time zone of social media accounts '''

    # gets table name / field / datatype for all tables as a Pandas data frame
    table_data = util.get_table_data()

    log_msgs = []
    log_msgs.append('Starting collect\n')

    user_tz = []
    try:
        query = "SELECT username, user_id, uid, medium FROM usernames WHERE medium='twitter' LIMIT {}".format(
            max_collect)
        cur = conn.cursor()
        cur.execute(query)
        rows = cur.fetchall()
        ct = 0
        for row in rows:

            username, user_id, unique_id, medium = row
            #log_msgs.append('Collect time zone for {} user: {} [ID: {}]'.format(medium, username, user_id))

            CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET = util.get_tokens(
                conn, medium, username)

            if medium == "twitter":  # big thanks to Andy Reagan here: https://github.com/andyreagan/tweet-stealing

                try:
                    twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET,
                                      ACCESS_TOKEN, ACCESS_SECRET)
                    user_info = twitter.verify_credentials()
                    tz = user_info['time_zone']
                    if tz is not None:
                        print(username, '::', tz_mapping[tz])
                        ct += 1
                        user_tz.append((tz_mapping[tz], username))
                    else:
                        print(username, '::', 'No time zone info')
                    #tz_collect_twitter(twitter, username, unique_id, conn, table_data)
                except Exception as e:
                    print(username, ":: ERROR:", str(e))

        #log_msgs.append('Collect log completed without top-level errors (check individual logs for per-basis errors).')

    except Exception as error:
        log_msgs.append('Collect error: {}'.format(str(error)))
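As excerpted, tz_collect() accumulates (time zone, username) pairs in user_tz but never writes them anywhere. One way to persist them would be a single executemany at the end of the function; the tz column name is hypothetical and the ?-placeholders assume an sqlite3-style driver:

if user_tz:
    cur = conn.cursor()
    # user_tz holds (time_zone, username) tuples, matching the SET/WHERE order below.
    cur.executemany("UPDATE usernames SET tz = ? WHERE username = ?", user_tz)
    conn.commit()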
Example #4
def get_auth(medium, username):
    try:
        conn = util.connect_db()
        callback = acquire_url_base + '?medium={}&username={}'.format(
            medium, username)

        tokens = util.get_tokens(conn, medium)

        if medium == "twitter":

            session['APP_KEY'] = tokens[0]
            session['APP_SECRET'] = tokens[1]

            twitter = Twython(session['APP_KEY'], session['APP_SECRET'])

            auth = twitter.get_authentication_tokens(callback_url=callback)

            session['OAUTH_TOKEN'] = auth['oauth_token']
            session['OAUTH_TOKEN_SECRET'] = auth['oauth_token_secret']

            return redirect(auth['auth_url'])

        elif medium == "instagram":
            CONFIG = {
                'client_id': tokens[2],
                'client_secret': tokens[3],
                'redirect_uri': callback
            }

            api = InstagramAPI(**CONFIG)

            session['APP_KEY'] = tokens[2]
            session['APP_SECRET'] = tokens[3]

            url = api.get_authorize_url(scope=["basic"])

            return redirect(url)

    except Exception as e:
        return str(e)
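get_auth() only starts the OAuth flow: it stores the request token in the Flask session and redirects the user to the provider. For Twitter, a companion callback view then has to exchange that request token plus the oauth_verifier query parameter for the final access token. A sketch of that step with Twython; the view name and session layout are assumptions, while get_authorized_tokens() is Twython's documented call for this exchange:

from flask import request, session
from twython import Twython

def twitter_callback():
    # Rebuild the client from the request token saved in get_auth().
    twitter = Twython(session['APP_KEY'], session['APP_SECRET'],
                      session['OAUTH_TOKEN'], session['OAUTH_TOKEN_SECRET'])
    # Twitter appends oauth_verifier to the callback URL it redirects to.
    final = twitter.get_authorized_tokens(request.args['oauth_verifier'])
    session['ACCESS_TOKEN'] = final['oauth_token']
    session['ACCESS_SECRET'] = final['oauth_token_secret']
    return 'Twitter account authorised.'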
Example #5
def get_auth(medium,username):
	try:
		conn = util.connect_db()
		callback = acquire_url_base+'?medium={}&username={}'.format(medium,username)

		tokens = util.get_tokens(conn, medium)

		if medium == "twitter":

			session['APP_KEY'] = tokens[0]
			session['APP_SECRET'] = tokens[1]

			twitter = Twython(session['APP_KEY'], session['APP_SECRET'])

			
			auth = twitter.get_authentication_tokens(callback_url=callback)

			session['OAUTH_TOKEN'] = auth['oauth_token']
			session['OAUTH_TOKEN_SECRET'] = auth['oauth_token_secret']

			return redirect(auth['auth_url'])

		elif medium == "instagram":
			CONFIG = {
			'client_id': tokens[2],
			'client_secret': tokens[3],
			'redirect_uri': callback
			}

			api = InstagramAPI(**CONFIG)
			
			session['APP_KEY'] = tokens[2]
			session['APP_SECRET'] = tokens[3]

			url = api.get_authorize_url(scope=["basic"])

			return redirect(url)

	except Exception as e:
		return str(e)
Example #6
def count_tokens_for_text(self, jsonpage):
    """
    Tokenize each text on a web page and record in a dict the number of
    tokens found in each language (russian, minor, trash).
    :param jsonpage: JSON result from the crawler for a concrete web page
    """
    texts = jsonpage['text']
    url = jsonpage['url']
    domain = self.unify_domain(jsonpage['domain'])
    for text_id in texts:
        try:
            text = texts[text_id]['text']
            lang = texts[text_id]['language']
        except KeyError:
            print(self.lang, url, text_id)
            continue
        tokens = get_tokens(text)
        if lang == self.lang:
            self.tokens_per_page[url] = self.tokens_per_page.get(url, 0) + len(tokens)
            self.site_tokens[domain] = self.site_tokens.get(domain, 0) + len(tokens)
        self.tokens_amount[lang] = self.tokens_amount.get(lang, 0) + len(tokens)
    # Use .get() so pages with no tokens in self.lang count as 0 instead of raising KeyError.
    self.site_tokens_per_page[domain].append(self.tokens_per_page.get(url, 0))
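The tallying above leans on the dict.get(key, 0) + n idiom for every counter, and site_tokens_per_page is presumably a defaultdict(list). collections.Counter gives the same accumulation with less boilerplate; a small standalone illustration of the equivalent bookkeeping:

from collections import Counter, defaultdict

tokens_amount = Counter()
site_tokens_per_page = defaultdict(list)

# Count tokens per language, as count_tokens_for_text does per page.
for lang, tokens in [('russian', ['привет', 'мир']), ('trash', ['xx'])]:
    tokens_amount[lang] += len(tokens)  # missing keys default to 0

site_tokens_per_page['example.com'].append(tokens_amount['russian'])
print(tokens_amount)         # Counter({'russian': 2, 'trash': 1})
print(site_tokens_per_page)  # defaultdict(<class 'list'>, {'example.com': [2]})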
Example #7
def collect(conn, test=False, max_collect=10):
	''' Collects social media records of validated study participants.
		- checks for uncollected participant data
		- scrapes, caches, extracts features and writes to db tables
		- note: this should run as a cron job every 15 minutes, but Dreamhost cron is weird. So we usually run manually. '''

	# gets table name / field / datatype for all tables as a Pandas data frame
	table_data = util.get_table_data()

	log_msgs = []
	log_msgs.append('Starting collect\n')

	try:
		query = "SELECT username, user_id, uid, medium FROM usernames WHERE collected=0 AND validated=1 LIMIT {}".format(max_collect)
		cur   = conn.cursor()
		cur.execute(query)
		rows  = cur.fetchall()

		for row in rows:

			username, user_id, unique_id, medium = row
			log_msgs.append('Collect for {} user: {} [ID: {}]'.format(medium, username, user_id))

			CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET = util.get_tokens(conn, medium, username)

			if medium == "twitter": # big thanks to Andy Reagan here: https://github.com/andyreagan/tweet-stealing
				
				twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)
				collect_twitter(twitter, username, unique_id, conn, table_data)

			elif medium == "instagram":

				instagram = InstagramAPI(access_token=ACCESS_TOKEN, client_secret=CONSUMER_SECRET)
				
				collect_instagram(instagram, username, unique_id, conn, table_data)
		log_msgs.append('Collect log completed without top-level errors (check individual logs for per-basis errors).')

	except Exception as error:
		log_msgs.append('Collect error: {}'.format(str(error)))
Example #8
def add_survey_data(conn,
                    control=False,
                    test=False,
                    beginning_of_start_after_id_string='R'):
    ''' Pulls down survey data via Qualtrics API, cleans, stores to SQLite database '''

    if test:
        surveys = ['test_pregnancy', 'test_depression']
    elif control:
        surveys = ['control_twitter', 'control_instagram']
    else:
        surveys = [
            'pregnancy_twitter', 'depression_twitter', 'cancer_twitter',
            'ptsd_twitter', 'pregnancy_instagram', 'depression_instagram',
            'cancer_instagram', 'ptsd_instagram', 'control_twitter',
            'control_instagram'
        ]

    new_data = False

    conditions = get_qualtrics_survey_ids(conn, surveys)
    # Qualtrics credentials
    _, _, user_id, api_token = util.get_tokens(conn, "qualtrics")

    # this CSV keeps track of the last survey response ID we've recorded, that's where we start pulling from qualtrics
    start_after_fname = "survey/TEST__most_recent_ids.csv" if test else "survey/most_recent_ids.csv"
    start_after_url = data_head + start_after_fname
    start_after = pd.read_csv(start_after_url)

    log_msgs = []

    for condition in conditions:

        log_msgs.append('\nStarting add survey data for survey: {}'.format(
            condition['name']))

        # get CSV of survey responses from Qualtrics API call
        data = get_qualtrics_survey_data(start_after,
                                         beginning_of_start_after_id_string,
                                         condition, user_id, api_token)

        # testing
        #print 'DATA: {}'.format(condition['name'])
        #print
        #print data.shape
        #print data
        #print
        #print
        if data.shape[0] > 0:  # if there are new entries, record to SQL
            new_data = True
            clean_qualtrics_data(data, condition)
            write_data_to_study_db(conn, data, condition, start_after)
            update_validated_usernames(conn, data, condition, log_msgs)

    # write updated start_after data to csv
    start_after.to_csv(start_after_url, index=False)

    if new_data:
        log_msgs.append("Survey data added successfully.")
    else:
        log_msgs.append("No new data to add.")

    log_dir = 'addsurveydata/'
    if test:
        log_dir = 'test/' + log_dir

    for msg in log_msgs:
        print(msg)

    util.log(log_msgs, log_dir)
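The start_after bookkeeping round-trips a small CSV through pandas: read it at the start, update the last-seen response ID for each survey as new rows come in, and write it back at the end. A toy version of that pattern; the column names are hypothetical because the real file layout isn't shown here:

import pandas as pd

# Hypothetical layout: one row per survey with the last Qualtrics response ID seen.
start_after = pd.DataFrame({'survey': ['control_twitter'],
                            'last_response_id': ['R_000']})

def update_start_after(start_after, survey_name, newest_id):
    # Record the newest response ID so the next run resumes after it.
    start_after.loc[start_after['survey'] == survey_name,
                    'last_response_id'] = newest_id

update_start_after(start_after, 'control_twitter', 'R_123')
start_after.to_csv('most_recent_ids.csv', index=False)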
Example #9
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys
import schedule
import util
import config

def setup():
  # Initialise config.ini if it doesn't exist
  if not os.path.exists("config.ini"):
    config.init_default()

  # Ensure tokens exist
  util.get_tokens()

  # Set up the posting timetable
  util.set_session_scheduler()

  # Run the scheduler on its own thread
  util.run_scheduler_async()

if __name__ == "__main__":
  # For testing purposes
  if len(sys.argv) > 1:
    if "test" in sys.argv:
      util.testing = True
      util.start_session(wait=False)
    if "fb" in sys.argv:
      util.get_tokens()
    sys.exit()
  setup()
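setup() delegates the recurring work to util.set_session_scheduler() and util.run_scheduler_async(). Given that the module imports schedule, a plausible shape for the async runner is a daemon thread that keeps calling schedule.run_pending(); this is a guess at the helper, not its actual implementation:

import threading
import time

import schedule

def run_scheduler_async(poll_seconds=30):
  # Poll the schedule on a background daemon thread so setup() can return.
  def _loop():
    while True:
      schedule.run_pending()
      time.sleep(poll_seconds)

  thread = threading.Thread(target=_loop, daemon=True)
  thread.start()
  return thread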
Example #10
def parse_to_ast(s, cpp=False, cpp_flags=[], optimise=False):
    '''Build a lexer and parser for the given grammar and then parse the string
    s.'''

    if isinstance(s, str):
        filename = None
    else:
        # Assume s is a file handle.
        filename = s.name
        s = s.read()

    # Pre-process the source with CPP if requested.
    if cpp:
        toolprefix = os.environ.get('TOOLPREFIX', '')
        p = subprocess.Popen(['%scpp' % toolprefix, '-P'] + cpp_flags,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        if filename is not None:
            s = ('#line 0 "%s"\n' % filename) + s
        stdout, stderr = p.communicate(s)
        if p.returncode != 0: #pylint: disable=E1101
            raise Exception('CPP failed: %s' % stderr)
        s = stdout

    log.info('Building keyword list...')
    # The ID token rule relies on checking whether the matched token is one of
    # a reserved set of keywords. We need to build that list (in util) before
    # doing any lexing.
    util.reset_keywords()
    util.merge_keywords(IDLKeywords.keywords)
    util.merge_keywords(ADLKeywords.keywords)
    log.info('Done')

    log.info('Building token list...')
    # lex expects a list of valid tokens to be defined in the variable
    # 'tokens'. This is quite annoying because the list of tokens can be
    # deduced automatically (which is what get_tokens() does) and there is no
    # nice way of namespacing tokens. This means that building the parser below
    # will generate spurious warnings about unused tokens if you are only
    # parsing a subset of the CAmkES grammar.
    tokens = util.get_tokens(globals())
    log.info('Done')

    # Lex and Yacc accept a logger to notify the caller of events, but they are
    # really noisy; much more so than is relevant for our purposes. Squash
    # their output unless the user has specifically requested it.
    errorlog = log.log if log.log.getEffectiveLevel() < logging.WARNING \
                       else lex.NullLogger()

    # Enable optimised lexer and parser construction if the caller has
    # requested it. See the PLY docs for the exact effect of this.
    optimize = 1 if optimise else 0

    log.info('Building lexer...')
    try:
        lex.lex(errorlog=errorlog, optimize=optimize).filename = filename
    except Exception as inst:
        raise Exception('Failed to build lexer: %s' % str(inst))
    log.info('Done')

    # yacc by default assumes the starting token is the first one it lexically
    # encounters in globals(). This is almost certainly *not* the behaviour we
    # want so explicitly direct it by specifying the start symbol according to
    # the grammar we are trying to parse.
    start = 'camkes'

    log.info('Building parser...')
    try:
        yacc.yacc(errorlog=errorlog, optimize=optimize)
    except Exception as inst:
        raise Exception('Failed to build parser: %s' % str(inst))
    log.info('Done\n')

    ast = yacc.parse(s)

    # Set the source filename of the AST items if we know it.
    assign_filenames(ast, filename)

    return ast
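The comments above explain that PLY's lex wants a module-level tokens list and that util.get_tokens() deduces it from the calling namespace. A minimal sketch of how such a helper could work, assuming token rules follow PLY's t_NAME convention; this is a guess at the utility, not the CAmkES implementation:

def get_tokens(namespace, keywords=()):
    # Every PLY lexer rule named 't_FOO' (function or string) defines a token 'FOO'.
    special = {'t_ignore', 't_error'}
    tokens = [name[2:] for name in namespace
              if name.startswith('t_') and name not in special]
    # Reserved keywords are usually matched by the ID rule and added by name.
    tokens.extend(k.upper() for k in keywords)
    return tokens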
Example #11
import numpy as np

from bert4keras.snippets import DataGenerator, sequence_padding
from util import load_data, get_tokens
from config import BaseConfig

# Load the dataset
data = load_data(
    BaseConfig.train_path
)
train_data = [d for i, d in enumerate(data) if i % 10 != 0]
valid_data = [d for i, d in enumerate(data) if i % 10 == 0]
test_data = load_data(
    BaseConfig.test_path
)

# Token frequencies over the dataset
tokens = get_tokens(data+test_data)


def random_mask(text_ids):
    """Randomly mask token ids (BERT-style masking)."""
    input_ids, output_ids = [], []
    rands = np.random.random(len(text_ids))
    for r, i in zip(rands, text_ids):
        if r < 0.15 * 0.8:
            input_ids.append(4)
            output_ids.append(i)
        elif r < 0.15 * 0.9:
            input_ids.append(i)
            output_ids.append(i)
        elif r < 0.15:
            # The original example is truncated here; the branches below follow
            # the usual 80/10/10 masking recipe and are an assumption.
            input_ids.append(np.random.randint(0, len(tokens)))
            output_ids.append(i)
        else:
            input_ids.append(i)
            output_ids.append(0)
    return input_ids, output_ids
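A quick sanity check for random_mask is to run it (inside the module above, where np and tokens are defined) on a short id sequence; on average about 15% of positions are selected for prediction, most of them visibly altered in input_ids. The numbers below are illustrative only:

np.random.seed(0)
ids = list(range(10, 40))
masked_inputs, targets = random_mask(ids)
changed = sum(1 for a, b in zip(ids, masked_inputs) if a != b)
print(len(ids), changed)  # 'changed' counts positions whose input id was altered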
Example #12
def parse_to_ast(s, cpp=False, cpp_flags=[], optimise=False):
    '''Build a lexer and parser for the given grammar and then parse the string
    s.'''

    if isinstance(s, str):
        filename = None
    else:
        # Assume s is a file handle.
        filename = s.name
        s = s.read()

    # Pre-process the source with CPP if requested.
    if cpp:
        toolprefix = os.environ.get('TOOLPREFIX', '')
        p = subprocess.Popen(['%scpp' % toolprefix, '-P'] + cpp_flags,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        if filename is not None:
            s = ('#line 0 "%s"\n' % filename) + s
        stdout, stderr = p.communicate(s)
        if p.returncode != 0:  #pylint: disable=E1101
            raise Exception('CPP failed: %s' % stderr)
        s = stdout

    log.info('Building keyword list...')
    # The ID token rule relies on checking whether the matched token is one of
    # a reserved set of keywords. We need to build that list (in util) before
    # doing any lexing.
    util.reset_keywords()
    util.merge_keywords(IDLKeywords.keywords)
    util.merge_keywords(ADLKeywords.keywords)
    log.info('Done')

    log.info('Building token list...')
    # lex expects a list of valid tokens to be defined in the variable
    # 'tokens'. This is quite annoying because the list of tokens can be
    # deduced automatically (which is what get_tokens() does) and there is no
    # nice way of namespacing tokens. This means that building the parser below
    # will generate spurious warnings about unused tokens if you are only
    # parsing a subset of the CAmkES grammar.
    tokens = util.get_tokens(globals())
    log.info('Done')

    # Lex and Yacc accept a logger to notify the caller of events, but they are
    # really noisy; much more so than is relevant for our purposes. Squash
    # their output unless the user has specifically requested it.
    errorlog = log.log if log.log.getEffectiveLevel() < logging.WARNING \
                       else lex.NullLogger()

    # Enable optimised lexer and parser construction if the caller has
    # requested it. See the PLY docs for the exact effect of this.
    optimize = 1 if optimise else 0

    log.info('Building lexer...')
    try:
        lex.lex(errorlog=errorlog, optimize=optimize).filename = filename
    except Exception as inst:
        raise Exception('Failed to build lexer: %s' % str(inst))
    log.info('Done')

    # yacc by default assumes the starting token is the first one it lexically
    # encounters in globals(). This is almost certainly *not* the behaviour we
    # want so explicitly direct it by specifying the start symbol according to
    # the grammar we are trying to parse.
    start = 'camkes'

    log.info('Building parser...')
    try:
        yacc.yacc(errorlog=errorlog, optimize=optimize)
    except Exception as inst:
        raise Exception('Failed to build parser: %s' % str(inst))
    log.info('Done\n')

    ast = yacc.parse(s)

    # Set the source filename of the AST items if we know it.
    assign_filenames(ast, filename)

    return ast
Example #13
def add_survey_data(conn, control=False, test=False, beginning_of_start_after_id_string='R'):
	''' Pulls down survey data via Qualtrics API, cleans, stores to SQLite database '''

	if test:
		surveys = ['test_pregnancy',
				   'test_depression']
	elif control:
		surveys = ['control_twitter',
				   'control_instagram']
	else:
		surveys = ['pregnancy_twitter',
				   'depression_twitter',
				   'cancer_twitter',
				   'ptsd_twitter',
				   'pregnancy_instagram',
				   'depression_instagram',
				   'cancer_instagram',
				   'ptsd_instagram',
				   'control_twitter',
				   'control_instagram']

	new_data = False
	
	conditions = get_qualtrics_survey_ids(conn, surveys)
	# Qualtrics credentials
	_,_, user_id, api_token = util.get_tokens(conn, "qualtrics")

	# this CSV keeps track of the last survey response ID we've recorded, that's where we start pulling from qualtrics
	start_after_fname = "survey/TEST__most_recent_ids.csv" if test else "survey/most_recent_ids.csv"
	start_after_url = data_head + start_after_fname
	start_after = pd.read_csv(start_after_url)
	
	log_msgs = []

	for condition in conditions:
		
		log_msgs.append('\nStarting add survey data for survey: {}'.format(condition['name']))
		
		# get CSV of survey responses from Qualtrics API call
		data = get_qualtrics_survey_data(start_after, beginning_of_start_after_id_string, condition, user_id, api_token)
		
		# testing
		#print 'DATA: {}'.format(condition['name'])
		#print
		#print data.shape 
		#print data
		#print
		#print 
		if data.shape[0] > 0: # if there are new entries, record to SQL
			new_data = True			
			clean_qualtrics_data(data, condition)
			write_data_to_study_db(conn, data, condition, start_after)
			update_validated_usernames(conn, data, condition, log_msgs)
			
	# write updated start_after data to csv 
	start_after.to_csv(start_after_url, index=False)

	if new_data:
		log_msgs.append("Survey data added successfully.")
	else:
		log_msgs.append("No new data to add.")

	log_dir = 'addsurveydata/'
	if test:
		log_dir = 'test/' + log_dir

	for msg in log_msgs:
		print(msg)

	util.log(log_msgs, log_dir)