class CooccurrencePipeline(PipelineModule):
    """Pipeline module that accumulates a co-occurrence graph over documents.

    For every processed document, each ordered pair of values found under
    the chosen document attribute is recorded on ``self.cooccur_graph`` via
    ``Graph.inc_edge``. The accumulated graph is returned by
    ``post_process`` and can be dumped to ``self.output`` with ``write``.
    """

    def __init__(self, output=None):
        """Set up an empty graph and the module's type markers.

        output -- path of the file ``write`` should produce, or None to
                  make ``write`` a no-op.
        """
        self.output = output
        self.cooccur_graph = Graph()
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)

    def process(self, source, data, attribute="categories"):
        """Record co-occurrences for a single document.

        source    -- not used by this module.
        data      -- object exposing a ``document`` container that supports
                     ``in`` and item lookup by attribute name.
        attribute -- key in ``data.document`` whose values are paired up.

        Documents that lack ``attribute`` are silently skipped.
        """
        if attribute in data.document:
            values = data.document[attribute]
            # Full cross product on purpose: every ordered pair is visited,
            # including (v, v) and both (a, b) and (b, a).
            # NOTE(review): presumably inc_edge bumps an edge counter --
            # confirm against Graph.
            for v1 in values:
                for v2 in values:
                    self.cooccur_graph.inc_edge(v1, v2)

    # method that gets run after all data has been processed
    # TODO: look into optimizing this, seems inefficient, written in derp-mode
    def post_process(self):
        """Return the co-occurrence graph accumulated so far."""
        return self.cooccur_graph

    def write(self):
        """Write the graph's edge list to ``self.output``.

        Does nothing when ``self.output`` is None. The file is opened in
        text write mode, so any existing content is replaced.
        """
        # Identity test: None is a singleton, and ``!=`` could be hijacked
        # by a custom __ne__ on whatever was passed as output.
        if self.output is not None:
            # Context manager guarantees the handle is closed even if
            # as_edgelist() or the write itself raises.
            with open(self.output, 'w') as f:
                f.write(self.cooccur_graph.as_edgelist())
def __init__(self, output=None):
    """Initialise the module with an empty graph and its type markers.

    output -- value stored on ``self.output``; defaults to None.
    """
    self.output = output
    self.cooccur_graph = Graph()

    # Module operates at document granularity.
    document_kind = enumModuleType.Document
    self.module_type = enumModuleType(document_kind)

    # Module is flagged with the PostProcess processing type.
    post_process_kind = enumModuleProcessingType.PostProcess
    self.module_processing_type = enumModuleProcessingType(post_process_kind)