def run(self):
    """Emit company/site pairs where the site maps to exactly one company.

    Grows an association tree from the site-count input, maps each parsed
    company onto it by homepage domain, then prints tab-separated
    ``permalink<TAB>site`` rows for sites that occur only once overall.
    """
    # Build the association tree keyed by domain parts.
    assoc_tree = AssociationTree(split_domain)
    for record in (SITE_COUNT_PARSER(raw) for raw in self.args.site_count):
        assoc_tree.grow(record, domain(record.site))

    # Lazily parse companies and associate them by homepage domain.
    parsed_companies = (COMPANY_PARSER(raw) for raw in self.args.companies)
    company_to_sites = assoc_tree.map(parsed_companies,
                                      lambda comp: domain(comp.hp))

    # How often each site shows up across all companies.
    occurrences = count_by_key(
        site
        for group in d_itervalues(company_to_sites)
        for site in group
    )

    for comp, assigned_sites in d_iteritems(company_to_sites):
        for assigned in assigned_sites:
            # Only report unambiguous (unique) site assignments.
            if occurrences[assigned] == 1:
                self.out('\t'.join([comp.permalink, assigned.site]))
def produce_features_weka(self):
    """Dump the indicator feature matrix as a Weka ARFF file.

    The output path is the config path with its extension swapped for
    ``.arff``.  Columns are ``site``, ``code``, then one column per
    indicator (stringified).

    Fix: the computed ``relation`` (basename of the config root) was
    previously unused and the full path ``root`` was passed to
    ``arff.dump`` as the relation name instead; pass ``relation`` as
    clearly intended.
    """
    root, _ = os.path.splitext(self.args.config)
    arff_f = '.'.join([root, 'arff'])
    relation = os.path.basename(root)
    names = ['site', 'code'] + [str(i) for i in self.indicators]
    arff.dump(arff_f, self._iter_rows(), relation=relation, names=names)
def names(self):
    """Render the C4.5-style ``.names`` content from the template.

    The class line lists the distinct classes (sorted, stringified); each
    indicator becomes one boolean attribute line.
    """
    distinct_classes = sorted(set(d_itervalues(self.code_to_cls)))
    class_line = ', '.join(str(cls) for cls in distinct_classes)
    attribute_lines = '\n'.join(
        '{0}:\tTrue, False.'.format(str(indicator))
        for indicator in self.indicators
    )
    return NAMES_TPL.format(class_line, attribute_lines)
def iter_sites_w_company(directory_or_file):
    """Yield ``(site, rank_series, company, code, tstamp)`` tuples.

    Parses every file under *directory_or_file* and converts each record's
    ranking history into a pandas Series indexed by timestamp.
    """
    raw_contents = iter_files_content(directory_or_file)
    for record in (SITES_W_COMPANY_PARSER(content) for content in raw_contents):
        rank_values = [entry.rank for entry in record.ranking]
        stamp_index = pandas.DatetimeIndex(
            [entry.tstamp for entry in record.ranking])
        rank_series = pandas.Series(rank_values, index=stamp_index)
        record_stamp = pandas.Timestamp(record.tstamp)
        yield (record.site, rank_series, record.company, record.code,
               record_stamp)
def __call__(self, parser, namespace, value, option_string=None):
    """argparse action: open *value* as a ``fileinput`` stream.

    If *value* is a directory, every regular file directly inside it is
    included; otherwise *value* itself is the single input file.  The
    resulting stream is bound to ``namespace.<dest>``.
    """
    if os.path.isdir(value):
        candidates = (os.path.join(value, name)
                      for name in os.listdir(value))
        files = [path for path in candidates if os.path.isfile(path)]
    else:
        files = [value]
    setattr(namespace, self.dest, fileinput.input(files))
def ids_to_samples(self):
    """Load sample rows keyed by site id.

    The samples location comes from ``--samples`` when given, otherwise
    from the config.  A single file is read as-is; a directory contributes
    every regular file directly inside it.  Returns a dict mapping
    ``site_id -> (site, parsed_tstamp, code)``.
    """
    directory = self.args.samples if self.args.samples else self.config['samples']

    if os.path.isfile(directory):
        sample_files = [directory]
    else:
        candidates = (os.path.join(directory, name)
                      for name in os.listdir(directory))
        sample_files = [path for path in candidates if os.path.isfile(path)]

    samples = {}
    for path in sample_files:
        for site, tstamp, code in csv_file_reader(path, delimiter='\t'):
            parsed_stamp = parse_tstamp(tstamp)
            samples[self.sites_to_ids[site]] = (site, parsed_stamp, code)
    return samples
def generate_missing_indicators(self):
    """Produce every not-yet-produced indicator using a pool of worker threads.

    Lists the pending indicators, asks the user for confirmation, then
    starts up to ``self.num_threads`` ``IndicatorUpdater`` workers that
    drain a shared queue.  Raises ``SystemExit`` if the user declines and
    ``Exception`` if any worker reports failure.
    """
    to_produce_q = Queue()
    self.out('Generating the following indicators:')
    # Queue only indicators that have not been produced yet, echoing each.
    for i in self.indicators:
        if not i.produced:
            to_produce_q.put(i)
            self.out(str(i))
    if not self.query_user_permission('Proceed?'):
        raise SystemExit('Canceled due to user interaction')
    threads = []
    # Never start more workers than there are queued items.
    for _ in range(min(self.num_threads, to_produce_q.qsize())):
        t = IndicatorUpdater(self.ids_to_samples, to_produce_q,
                             StreamAlexaIndicatorsCaller(self.cmd_path))
        t.start()
        threads.append(t)
    # Wait for all workers to finish before checking their status.
    for t in threads:
        t.join()
    # Workers expose a `failed` flag; any failure aborts the run.
    if any(imap(operator.attrgetter('failed'), threads)):
        raise Exception('At least one thread died!')
    # NOTE(review): joining the queue after all threads have already been
    # joined looks redundant — and would block forever if a worker ever
    # exited without calling task_done() for each item. Confirm the
    # IndicatorUpdater contract.
    to_produce_q.join()
def make_fr_per_date_plot(companies, plot_file=None):
    """Histogram funding rounds per month, one series per round code.

    Only rounds on/after 2011-03-01 are counted.  Bins are month
    boundaries covering the observed range plus one trailing month.
    Saves the figure to *plot_file* when given and returns the figure.
    """
    rounds_by_code = collections.defaultdict(list)
    earliest = datetime.date(2011, 3, 1)
    month_starts = set()

    for record in (FLATTENED_PARSER(content)
                   for content in iter_files_content(companies)):
        if record.tstamp >= earliest:
            rounds_by_code[record.code].append(
                matplotlib.dates.date2num(record.tstamp))
            month_starts.add(
                datetime.date(record.tstamp.year, record.tstamp.month, 1))

    month_starts = sorted(month_starts)
    # Close the final bin with the first day of the following month.
    overshoot = month_starts[-1] + datetime.timedelta(31)
    month_starts.append(datetime.date(overshoot.year, overshoot.month, 1))

    fig = plt.figure(figsize=(4*1.4, 3*1.4))
    ax = fig.add_subplot(111)
    ax.hist(rounds_by_code.values(),
            label=[code.title() for code in rounds_by_code.keys()],
            bins=matplotlib.dates.date2num(month_starts))
    ax.set_xlim(matplotlib.dates.date2num(month_starts[0]),
                matplotlib.dates.date2num(month_starts[-1]))
    ax.legend()
    ax.xaxis.set_major_locator(
        matplotlib.dates.MonthLocator(bymonthday=15, interval=2)
    )
    ax.xaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(
            lambda tick, _: matplotlib.dates.num2date(tick).strftime('%B %Y')
        )
    )
    fig.autofmt_xdate()
    ax.set_ylabel('Number of Funding Rounds')
    ax.grid(True, axis='y')
    if plot_file:
        fig.savefig(plot_file)
    return fig