import csv
import datetime
import struct
import time


def remove_unwanted(src, dest):
    print "Processing %s" % src
    data = read_vm_file(src)
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, target, num_clicks in data:
            try:
                r = normalize_url(referrer)
                t = normalize_url(target)
                if not should_skip_host(t):
                    writer.writerow([r, t, num_clicks])
            except Exception:
                print "Couldn't normalize. Skipping."
                print referrer
                print target
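
# read_vm_file() is referenced above but not defined in this section. A minimal
# sketch, assuming the input is a tab-separated file of
# (referrer, target, num_clicks) rows -- the same shape these functions write
# back out; the real reader may differ:
def read_vm_file(filepath):
    with open(filepath, 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        return [(row[0], row[1], row[2]) for row in reader]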
def prune_news_dataset(news_sources_file):
    news_urls = set()
    with open(news_sources_file, 'r') as f:
        for line in f:
            # strip an optional scheme ("http://", etc.) before isolating the host
            if '://' in line:
                host = line[line.index('://') + 3:]
            else:
                host = line
            host = host.strip().split('/')[0]
            host = normalize_url(host)
            if host in news_urls or host in UNWANTED_URLS or \
                    fnmatches_multiple(EXCEPTION_PATTERNS, host):
                continue
            news_urls.add(host)
    # print only hosts whose parent domains are not already covered
    for host in sorted(news_urls):
        disregard = False
        for parent in parents(host):
            if parent in news_urls or parent in UNWANTED_URLS:
                disregard = True
                break
        if not disregard:
            print host
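
# fnmatches_multiple() and parents() are used above but defined elsewhere.
# Sketches under stated assumptions: fnmatches_multiple() is assumed to test a
# name against a list of fnmatch-style globs, and parents() is assumed to yield
# each successively shorter parent domain of a host
# (e.g. "news.example.com" -> "example.com" -> "com"). UNWANTED_URLS and
# EXCEPTION_PATTERNS are placeholders here; the real lists live elsewhere.
import fnmatch

UNWANTED_URLS = set()    # hosts to exclude (populated elsewhere)
EXCEPTION_PATTERNS = []  # fnmatch globs for hosts to exclude (populated elsewhere)


def fnmatches_multiple(patterns, name):
    return any(fnmatch.fnmatch(name, p) for p in patterns)


def parents(host):
    labels = host.split('.')
    for i in range(1, len(labels)):
        yield '.'.join(labels[i:])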
def filter_news_junk(src, dest):
    print "Processing %s" % src
    data = read_vm_file(src)
    with open(dest, 'w') as destf:
        writer = csv.writer(destf, delimiter="\t")
        for referrer, target, num_clicks in data:
            target = normalize_url(target)
            if not should_skip_host(target):
                writer.writerow([referrer, target, num_clicks])
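
# should_skip_host() (used by remove_unwanted() and filter_news_junk() above)
# is not defined in this section either. A plausible sketch, assuming it
# consults the same UNWANTED_URLS set and EXCEPTION_PATTERNS globs as
# prune_news_dataset(); the actual matching logic is an assumption:
def should_skip_host(host):
    return host in UNWANTED_URLS or fnmatches_multiple(EXCEPTION_PATTERNS, host)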
def read_raw_aol_data(filepath):
    clicks = []
    with open(filepath, 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        reader.next()  # skip header
        for row in reader:
            try:
                # column 0 is the user id, column 4 the clicked URL
                user = int(row[0])
                dest = normalize_url(row[4])
                clicks.append((user, dest))
            except Exception:
                print "Failed:", row
    return clicks
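
# normalize_url() is the helper every reader here depends on; it is not defined
# in this section. A minimal sketch, assuming it lowercases the URL and strips
# the scheme and a leading "www." (the real implementation may do more, e.g.
# strip ports or trailing dots):
def normalize_url(url):
    url = url.strip().lower()
    if '://' in url:
        url = url[url.index('://') + 3:]
    if url.startswith('www.'):
        url = url[4:]
    return url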
def read_raw_twitter_data(filepath):
    tweets = []
    with open(filepath, 'r') as f:
        for line in f:
            try:
                # fields are pipe-delimited:
                # user | datetime | url (possibly spanning fields) | followers | friends | <trailing field>
                parts = line.split("|")
                user = int(parts[0])
                dtstr = parts[1]  # timestamp string, read but unused here
                friends = int(parts[-2])
                followers = int(parts[-3])
                # the URL occupies the fields between the datetime and the
                # counts; concatenate them (the '|' separators are dropped)
                dest = "".join(parts[2:-3])
                dest = normalize_url(dest)
                tweets.append((user, dest, followers, friends))
            except Exception:
                print "Could not parse line: %s\t%s" % (line, filepath)
    return tweets
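# validate_requests() below relies on several module-level names not defined in
# this section. The field indices follow the in-code comment "a request is
# [timestamp, referrer, target, direction, agent]"; the stage values only need
# to be distinct, and SKIP_LINES is a placeholder (the real set of literal log
# lines to ignore lives elsewhere):
TIMESTAMP, REFERRER, TARGET, DIRECTION, AGENT = range(5)
READING_REFERRER, READING_TARGET, READING_FILEPATH = range(3)
SKIP_LINES = set()  # populated elsewhere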
def validate_requests(src, valid_dest, invalid_dest):
    print "Processing", src
    # extract a timestamp from the filename to validate against
    file_dt = parse_dt_from_filename(src)
    with open(valid_dest, 'w') as valid_destf:
        valid_writer = csv.writer(valid_destf, delimiter="\t")
        with open(invalid_dest, 'w') as invalid_destf:
            with open(src, 'r') as f:
                stage = READING_REFERRER
                # a request is [timestamp, referrer, target, direction, agent]
                request = [None, None, None, None, None]
                # discard the header line
                f.readline()
                out_of_sync = False
                prev = None
                prevprev = None
                for rawline in f:
                    line = rawline.strip()
                    if line in SKIP_LINES:
                        continue
                    # if somehow we got out of sync reading the file,
                    # try to find a referrer line
                    if out_of_sync:
                        stage = READING_REFERRER
                    # reading the first line of a request, of the form XXXXAD[R] where
                    # XXXX is the timestamp in little-endian byte order,
                    # A is the agent: 'B' for browser or '?' for unknown,
                    # D is the direction: 'I' for traffic going into IU, 'O' for traffic going outside IU,
                    # R is the referrer
                    if stage == READING_REFERRER:
                        # it's possible to have blank lines between records
                        if line == '':
                            continue
                        for s in ['BI', 'BO', '?I', '?O']:
                            idx = line.find(s)
                            if idx != -1:
                                out_of_sync = False
                                try:
                                    request[TIMESTAMP] = struct.unpack('<I', line[:idx])[0]
                                except Exception:
                                    # malformed timestamp bytes; fall back to the file's timestamp
                                    request[TIMESTAMP] = time.mktime(file_dt.timetuple())
                                request[AGENT] = line[idx]
                                request[DIRECTION] = line[idx + 1]
                                request[REFERRER] = normalize_url(line[idx + 2:]).split('/')[0]
                                break
                        else:
                            out_of_sync = True
                        stage = READING_TARGET
                    # reading the requested host
                    elif stage == READING_TARGET:
                        request[TARGET] = normalize_url(line.split('/')[0])
                        stage = READING_FILEPATH
                    # reading the requested file path; after this step the request is
                    # complete and can be matched against the validity criteria
                    elif stage == READING_FILEPATH:
                        is_valid = True
                        for val in request:
                            if val is None:
                                is_valid = False
                                break
                        if is_valid:
                            dt = datetime.datetime.fromtimestamp(request[TIMESTAMP])
                            # if the file's and the record's timestamps disagree by
                            # more than one hour, use the file's timestamp instead
                            tdelta = dt - file_dt
                            if tdelta.days < 0:
                                tdelta = -tdelta
                            if tdelta.total_seconds() / 60.0 / 60.0 > 1:
                                request[TIMESTAMP] = time.mktime(file_dt.timetuple())
                            is_valid = (request[AGENT] == 'B' or request[AGENT] == '?') and \
                                       (request[DIRECTION] == 'I' or request[DIRECTION] == 'O')
                        if is_valid:
                            if request[AGENT] == 'B' and request[DIRECTION] == 'O':
                                valid_writer.writerow([
                                    request[TIMESTAMP],
                                    request[REFERRER],
                                    request[TARGET]
                                ])
                            else:
                                invalid_destf.write("%s\n" % prevprev)
                                invalid_destf.write("%s\n" % prev)
                                invalid_destf.write("%s\n" % line)
                        # reset the variables describing the request
                        request = [None, None, None, None, None]
                        stage = READING_REFERRER
                    # we should never get here
                    else:
                        raise ValueError("Invalid stage: %d" % stage)
                    prevprev = prev
                    prev = line
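
# parse_dt_from_filename() is not defined in this section, and the actual log
# file naming scheme is unknown. A purely illustrative sketch, assuming a
# hypothetical name such as "requests-20090314-05.log" with an embedded
# YYYYMMDD-HH stamp:
import re


def parse_dt_from_filename(filepath):
    m = re.search(r'(\d{8})-(\d{2})', filepath)
    return datetime.datetime.strptime(m.group(1) + m.group(2), '%Y%m%d%H')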