def setup():
    # Initialise config.ini if it doesn't exist
    if not os.path.exists("config.ini"):
        config.init_default()
    # Ensure tokens exist
    util.get_tokens()
    # Set up the posting timetable
    util.set_session_scheduler()
    # Run the scheduler on its own thread
    util.run_scheduler_async()

def collect(conn, test=False, max_collect=10):
    '''
    Collects social media records of validated study participants.
    - checks for uncollected participant data
    - scrapes, caches, extracts features and writes to db tables
    - note: this should run as a cron job every 15 minutes, but Dreamhost cron
      is unreliable, so we usually run it manually.
    '''
    # gets table name / field / datatype for all tables as a Pandas data frame
    table_data = util.get_table_data()

    log_msgs = []
    log_msgs.append('Starting collect\n')

    try:
        query = "SELECT username, user_id, uid, medium FROM usernames WHERE collected=0 AND validated=1 LIMIT {}".format(max_collect)
        cur = conn.cursor()
        cur.execute(query)
        rows = cur.fetchall()

        for row in rows:
            username, user_id, unique_id, medium = row
            log_msgs.append('Collect for {} user: {} [ID: {}]'.format(medium, username, user_id))

            CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET = util.get_tokens(conn, medium, username)

            if medium == "twitter":
                # big thanks to Andy Reagan here: https://github.com/andyreagan/tweet-stealing
                twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)
                collect_twitter(twitter, username, unique_id, conn, table_data)

            elif medium == "instagram":
                instagram = InstagramAPI(access_token=ACCESS_TOKEN, client_secret=CONSUMER_SECRET)
                collect_instagram(instagram, username, unique_id, conn, table_data)

        log_msgs.append('Collect log completed without top-level errors (check individual logs for per-basis errors).')

    except Exception as error:
        log_msgs.append('Collect error: {}'.format(str(error)))
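
# A minimal sketch of how collect() might be driven manually (the docstring above
# notes that the cron job is usually skipped). util.connect_db() is assumed to
# return a DB connection, as in get_auth() further down; the __main__ guard and
# the max_collect value are illustrative, not part of the original project.
if __name__ == "__main__":
    conn = util.connect_db()           # assumed helper, mirrors get_auth() below
    try:
        collect(conn, max_collect=10)  # process up to 10 uncollected, validated users
    finally:
        conn.close()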

def tz_collect(conn, test=False, max_collect=1000):
    ''' Collects time zone of social media accounts '''
    # gets table name / field / datatype for all tables as a Pandas data frame
    table_data = util.get_table_data()

    log_msgs = []
    log_msgs.append('Starting collect\n')

    user_tz = []

    try:
        query = "SELECT username, user_id, uid, medium FROM usernames WHERE medium='twitter' LIMIT {}".format(max_collect)
        cur = conn.cursor()
        cur.execute(query)
        rows = cur.fetchall()

        ct = 0
        for row in rows:
            username, user_id, unique_id, medium = row
            #log_msgs.append('Collect time zone for {} user: {} [ID: {}]'.format(medium, username, user_id))

            CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET = util.get_tokens(conn, medium, username)

            if medium == "twitter":
                # big thanks to Andy Reagan here: https://github.com/andyreagan/tweet-stealing
                try:
                    twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)
                    user_info = twitter.verify_credentials()
                    tz = user_info['time_zone']
                    if tz is not None:
                        print username, '::', tz_mapping[tz]
                        ct += 1
                        user_tz.append((tz_mapping[tz], username))
                    else:
                        print username, '::', 'No time zone info'
                    #tz_collect_twitter(twitter, username, unique_id, conn, table_data)
                except Exception as e:
                    print username, ":: ERROR:", str(e)

        #log_msgs.append('Collect log completed without top-level errors (check individual logs for per-basis errors).')

    except Exception as error:
        log_msgs.append('Collect error: {}'.format(str(error)))
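
# tz_collect() looks up tz_mapping[tz], a dict that is not shown in this snippet.
# It presumably maps Twitter's legacy 'time_zone' labels to canonical tz database
# names; the entries below are illustrative guesses, not the project's actual table.
tz_mapping = {
    'Eastern Time (US & Canada)': 'America/New_York',
    'Central Time (US & Canada)': 'America/Chicago',
    'Mountain Time (US & Canada)': 'America/Denver',
    'Pacific Time (US & Canada)': 'America/Los_Angeles',
    'London': 'Europe/London',
}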

def get_auth(medium, username):
    try:
        conn = util.connect_db()
        callback = acquire_url_base + '?medium={}&username={}'.format(medium, username)
        tokens = util.get_tokens(conn, medium)

        if medium == "twitter":
            session['APP_KEY'] = tokens[0]
            session['APP_SECRET'] = tokens[1]
            twitter = Twython(session['APP_KEY'], session['APP_SECRET'])
            auth = twitter.get_authentication_tokens(callback_url=callback)
            session['OAUTH_TOKEN'] = auth['oauth_token']
            session['OAUTH_TOKEN_SECRET'] = auth['oauth_token_secret']
            return redirect(auth['auth_url'])

        elif medium == "instagram":
            CONFIG = {
                'client_id': tokens[2],
                'client_secret': tokens[3],
                'redirect_uri': callback
            }
            api = InstagramAPI(**CONFIG)
            session['APP_KEY'] = tokens[2]
            session['APP_SECRET'] = tokens[3]
            url = api.get_authorize_url(scope=["basic"])
            return redirect(url)

    except Exception as e:
        return str(e)
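
# get_auth() only starts the OAuth flow; a callback view has to finish it. The
# sketch below shows the token-exchange step using the same Flask session keys
# set above. The route name, `app`, `request`, and what is done with the tokens
# are illustrative assumptions; twitter.get_authorized_tokens() and
# api.exchange_code_for_access_token() are the Twython and python-instagram
# calls for completing their respective flows.
@app.route('/callback')
def auth_callback():
    medium = request.args.get('medium')
    if medium == "twitter":
        twitter = Twython(session['APP_KEY'], session['APP_SECRET'],
                          session['OAUTH_TOKEN'], session['OAUTH_TOKEN_SECRET'])
        creds = twitter.get_authorized_tokens(request.args['oauth_verifier'])
        # creds['oauth_token'] / creds['oauth_token_secret'] would then be stored
        # alongside the username for later use by collect()
    elif medium == "instagram":
        api = InstagramAPI(client_id=session['APP_KEY'],
                           client_secret=session['APP_SECRET'],
                           redirect_uri=acquire_url_base)
        access_token, _ = api.exchange_code_for_access_token(request.args['code'])
        # access_token would then be stored for later use by collect()
    return 'ok'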

def count_tokens_for_text(self, jsonpage):
    """
    Tokenize each text on a web page and record, per language
    (Russian, minor, trash), how many tokens it contains.

    :param jsonpage: JSON result from the crawler for a single web page
    """
    texts = jsonpage['text']
    url = jsonpage['url']
    domain = self.unify_domain(jsonpage['domain'])

    for text_id in texts:
        try:
            text = texts[text_id]['text']
            lang = texts[text_id]['language']
        except Exception:
            print(self.lang, url, text_id)
            continue

        tokens = get_tokens(text)
        if lang == self.lang:
            self.tokens_per_page[url] = self.tokens_per_page.get(url, 0) + len(tokens)
            self.site_tokens[domain] = self.site_tokens.get(domain, 0) + len(tokens)
        self.tokens_amount[lang] = self.tokens_amount.get(lang, 0) + len(tokens)

    # per-page token count for this domain (0 if the page had no text in the target language)
    self.site_tokens_per_page[domain].append(self.tokens_per_page.get(url, 0))
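
# get_tokens() is not shown in this snippet; all count_tokens_for_text() needs is
# something that returns a list whose length is the token count. A minimal
# stand-in could be a regex word tokenizer like the one below (the real
# implementation may well use a proper NLP tokenizer instead).
import re

WORD_RE = re.compile(r'\w+', re.UNICODE)

def get_tokens(text):
    """Split text into word-like tokens (Unicode-aware, so Cyrillic works)."""
    return WORD_RE.findall(text)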

def add_survey_data(conn, control=False, test=False, beginning_of_start_after_id_string='R'):
    ''' Pulls down survey data via the Qualtrics API, cleans it, and stores it in the SQLite database '''
    if test:
        surveys = ['test_pregnancy', 'test_depression']
    elif control:
        surveys = ['control_twitter', 'control_instagram']
    else:
        surveys = ['pregnancy_twitter', 'depression_twitter', 'cancer_twitter', 'ptsd_twitter',
                   'pregnancy_instagram', 'depression_instagram', 'cancer_instagram', 'ptsd_instagram',
                   'control_twitter', 'control_instagram']

    new_data = False
    conditions = get_qualtrics_survey_ids(conn, surveys)

    # Qualtrics credentials
    _, _, user_id, api_token = util.get_tokens(conn, "qualtrics")

    # this CSV keeps track of the last survey response ID we've recorded;
    # that's where we start pulling from Qualtrics
    start_after_fname = "survey/TEST__most_recent_ids.csv" if test else "survey/most_recent_ids.csv"
    start_after_url = data_head + start_after_fname
    start_after = pd.read_csv(start_after_url)

    log_msgs = []

    for condition in conditions:
        log_msgs.append('\nStarting add survey data for survey: {}'.format(condition['name']))

        # get CSV of survey responses from Qualtrics API call
        data = get_qualtrics_survey_data(start_after, beginning_of_start_after_id_string,
                                         condition, user_id, api_token)

        # testing
        #print 'DATA: {}'.format(condition['name'])
        #print data.shape
        #print data

        if data.shape[0] > 0:
            # if there are new entries, record to SQL
            new_data = True
            clean_qualtrics_data(data, condition)
            write_data_to_study_db(conn, data, condition, start_after)
            update_validated_usernames(conn, data, condition, log_msgs)

    # write updated start_after data to csv
    start_after.to_csv(start_after_url, index=False)

    if new_data:
        log_msgs.append("Survey data added successfully.")
    else:
        log_msgs.append("No new data to add.")

    log_dir = 'addsurveydata/'
    if test:
        log_dir = 'test/' + log_dir

    for msg in log_msgs:
        print msg
    util.log(log_msgs, log_dir)

import os

# hide GPUs from any CUDA-aware libraries imported below
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import sys

import schedule

import util
import config


def setup():
    # Initialise config.ini if it doesn't exist
    if not os.path.exists("config.ini"):
        config.init_default()
    # Ensure tokens exist
    util.get_tokens()
    # Set up the posting timetable
    util.set_session_scheduler()
    # Run the scheduler on its own thread
    util.run_scheduler_async()


if __name__ == "__main__":
    # For testing purposes
    if len(sys.argv) > 1:
        if "test" in sys.argv:
            util.testing = True
            util.start_session(wait=False)
        if "fb" in sys.argv:
            util.get_tokens()
        sys.exit()
    setup()
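
# util.run_scheduler_async() is not shown here, but since the module imports the
# `schedule` package, a typical implementation polls schedule.run_pending() on a
# daemon thread, roughly as sketched below (the function body is an assumption,
# not the project's actual code; the main program is assumed to stay alive, e.g.
# serving requests, while the daemon thread runs).
import threading
import time

def run_scheduler_async(poll_interval=1):
    """Run the `schedule` job loop in a background daemon thread."""
    def loop():
        while True:
            schedule.run_pending()    # execute any jobs that are due
            time.sleep(poll_interval)
    t = threading.Thread(target=loop, daemon=True)
    t.start()
    return t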

def parse_to_ast(s, cpp=False, cpp_flags=[], optimise=False):
    '''Build a lexer and parser for the given grammar and then parse the string s.'''
    if isinstance(s, str):
        filename = None
    else:
        # Assume s is a file handle.
        filename = s.name
        s = s.read()

    # Pre-process the source with CPP if requested.
    if cpp:
        toolprefix = os.environ.get('TOOLPREFIX', '')
        p = subprocess.Popen(['%scpp' % toolprefix, '-P'] + cpp_flags,
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        if filename is not None:
            s = ('#line 0 "%s"\n' % filename) + s
        stdout, stderr = p.communicate(s)
        if p.returncode != 0:  # pylint: disable=E1101
            raise Exception('CPP failed: %s' % stderr)
        s = stdout

    log.info('Building keyword list...')
    # The ID token rule relies on checking whether the matched token is one of
    # a reserved set of keywords. We need to build that list (in util) before
    # doing any lexing.
    util.reset_keywords()
    util.merge_keywords(IDLKeywords.keywords)
    util.merge_keywords(ADLKeywords.keywords)
    log.info('Done')

    log.info('Building token list...')
    # lex expects a list of valid tokens to be defined in the variable
    # 'tokens'. This is quite annoying because the list of tokens can be
    # deduced automatically (which is what get_tokens() does) and there is no
    # nice way of namespacing tokens. This means that building the parser below
    # will generate spurious warnings about unused tokens if you are only
    # parsing a subset of the CAmkES grammar.
    tokens = util.get_tokens(globals())
    log.info('Done')

    # Lex and Yacc accept a logger to notify the caller of events, but they are
    # really noisy; much more so than is relevant for our purposes. Squash
    # their output unless the user has specifically requested it.
    errorlog = log.log if log.log.getEffectiveLevel() < logging.WARNING \
        else lex.NullLogger()

    # Enable optimised lexer and parser construction if the caller has
    # requested it. See the PLY docs for the exact effect of this.
    optimize = 1 if optimise else 0

    log.info('Building lexer...')
    try:
        lex.lex(errorlog=errorlog, optimize=optimize).filename = filename
    except Exception as inst:
        raise Exception('Failed to build lexer: %s' % str(inst))
    log.info('Done')

    # yacc by default assumes the starting token is the first one it lexically
    # encounters in globals(). This is almost certainly *not* the behaviour we
    # want so explicitly direct it by specifying the start symbol according to
    # the grammar we are trying to parse.
    start = 'camkes'

    log.info('Building parser...')
    try:
        yacc.yacc(errorlog=errorlog, optimize=optimize)
    except Exception as inst:
        raise Exception('Failed to build parser: %s' % str(inst))
    log.info('Done\n')

    ast = yacc.parse(s)

    # Set the source filename of the AST items if we know it.
    assign_filenames(ast, filename)

    return ast
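
# Example of driving parse_to_ast() directly: it accepts either a string or an
# open file handle, and can optionally push the input through the C preprocessor
# first. The file name and CPP flag below are illustrative, not taken from the
# CAmkES sources.
if __name__ == '__main__':
    with open('system.camkes') as f:
        ast = parse_to_ast(f, cpp=True, cpp_flags=['-DDEBUG'])
    for item in ast:
        print(item)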

import numpy as np

from bert4keras.snippets import DataGenerator, sequence_padding

from util import load_data, get_tokens
from config import BaseConfig

# Load the dataset
data = load_data(BaseConfig.train_path)
train_data = [d for i, d in enumerate(data) if i % 10 != 0]
valid_data = [d for i, d in enumerate(data) if i % 10 == 0]
test_data = load_data(BaseConfig.test_path)

# Token frequencies over the full dataset
tokens = get_tokens(data + test_data)


def random_mask(text_ids):
    """Randomly mask token ids, BERT-style."""
    input_ids, output_ids = [], []
    rands = np.random.random(len(text_ids))
    for r, i in zip(rands, text_ids):
        if r < 0.15 * 0.8:
            input_ids.append(4)
            output_ids.append(i)
        elif r < 0.15 * 0.9:
            input_ids.append(i)
            output_ids.append(i)
        elif r < 0.15:
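            # --- assumed continuation: the original snippet is truncated at the
            # `elif r < 0.15:` line above. In the standard BERT-style 80/10/10
            # masking scheme this function follows, the remaining ~10% of masked
            # positions get a random token id and all other positions keep their
            # token with 0 as the "not predicted" label. The `+ 7` vocabulary
            # offset is a guess, not taken from the original code. ---
            input_ids.append(np.random.choice(len(tokens)) + 7)
            output_ids.append(i)
        else:
            input_ids.append(i)
            output_ids.append(0)
    return input_ids, output_ids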