class GephiPipeline(object):
    """Item pipeline that streams patent items to a Gephi workspace as a graph.

    For every processed item the pipeline emits:

    * one "patent" node keyed by ``item['publication_number']``;
    * one "link" node per entry in ``citations`` (edge patent -> citation)
      and per entry in ``cited_by`` (edge cited_by -> patent);
    * one "entity" node per distinct inventor/assignee (edge entity -> patent).

    ``self.nodes`` remembers the identifiers of patent and link nodes already
    sent, so that a patent first seen as a citation is later *changed*
    rather than added a second time.
    """

    # Keyword arguments handed to ``Node`` for each kind of node.
    PATENT_STYLE = {'size': 5, 'red': 1, 'green': 0, 'blue': 0}
    LINK_STYLE = {'size': 5, 'red': 0, 'green': 0, 'blue': 1}
    ENTITY_STYLE = {'size': 5, 'red': 0, 'green': 1, 'blue': 0}

    # Optional item fields copied onto the patent node's properties
    # (missing fields are stored as None, as ``item.get`` returns).
    PATENT_FIELDS = (
        'title',
        'filing_date',
        'publication_date',
        'priority_date',
        'grant_date',
        'pdf',
    )

    def __init__(self, gephi_uri, gephi_ws):
        """Store the connection settings; no connection is opened here.

        :param gephi_uri: value passed as ``iGephiUrl`` to the streamer.
        :param gephi_ws: value passed as ``iGephiWorkspace`` to the streamer.
        """
        self.logger = logging.getLogger(type(self).__name__)
        self.gephi_uri = gephi_uri
        self.gephi_ws = gephi_ws
        # Streamer handle; created in ``open_spider``.
        self.gephi = None
        # Identifiers of patent/link nodes already sent to Gephi.
        self.nodes = set()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``GEPHI_URI`` / ``GEPHI_WS`` settings."""
        return cls(
            gephi_uri=crawler.settings.get('GEPHI_URI'),
            gephi_ws=crawler.settings.get('GEPHI_WS')
        )

    def open_spider(self, spider):
        """Create the Gephi streamer for the configured URI and workspace."""
        self.gephi = GephiStreamerManager(iGephiUrl=self.gephi_uri,
                                          iGephiWorkspace=self.gephi_ws)
        self.logger.info('GephiStream connected {}'.format(self.gephi_uri))

    def close_spider(self, spider):
        """Nothing to release when the spider closes."""
        pass

    def process_item(self, item, spider):
        """Publish ``item`` and its relations to Gephi, then return it.

        :param item: mapping-like item; ``publication_number`` is required,
            ``citations``, ``cited_by``, ``inventors`` and ``assignees`` are
            optional iterables (missing or ``None`` is treated as empty).
        :param spider: unused.
        :returns: the same ``item``, unchanged.
        :raises KeyError: if ``publication_number`` is missing.
        """
        number = item['publication_number']
        patent_node = self._build_patent_node(item)

        # A patent already known (e.g. first seen as someone's citation)
        # is updated in place instead of being added again.
        if number in self.nodes:
            self.gephi.change_node(patent_node)
        else:
            self.gephi.add_node(patent_node)

        for citation in item.get('citations') or []:
            self._add_link(citation, patent_node, patent_is_source=True)
        for cited_by in item.get('cited_by') or []:
            self._add_link(cited_by, patent_node, patent_is_source=False)

        # Inventors and assignees are merged so that a name appearing in
        # both lists produces a single entity node.
        entities = set(item.get('inventors') or [])
        entities.update(item.get('assignees') or [])
        for entity in entities:
            self._add_entity(entity, patent_node)

        self.logger.info('Publishing item {}'.format(number))
        try:
            self.gephi.commit()
        except ConnectionError as e:
            # Best effort: a failed push is logged and the item still flows
            # on to later pipelines. ``ConnectionError`` is whichever name is
            # in scope at module level (import not visible in this block).
            self.logger.error(e)
        # NOTE(review): the node is recorded as known even when the commit
        # failed, matching the original behaviour -- confirm this is intended.
        self.nodes.add(number)
        return item

    def _build_patent_node(self, item):
        """Return the "patent" node for ``item`` with its metadata attached."""
        node = Node(item['publication_number'], **self.PATENT_STYLE)
        node.property['type'] = 'patent'
        for field in self.PATENT_FIELDS:
            node.property[field] = item.get(field)
        return node

    def _add_link(self, identifier, patent_node, patent_is_source):
        """Add a "link" node for ``identifier`` and connect it to the patent.

        The edge runs patent -> link when ``patent_is_source`` is true and
        link -> patent otherwise.
        """
        link_node = Node(identifier, **self.LINK_STYLE)
        link_node.property['type'] = 'link'
        self.gephi.add_node(link_node)
        # Third ``Edge`` argument kept as in the original; presumably it
        # marks the edge as directed -- confirm against the Edge API.
        if patent_is_source:
            self.gephi.add_edge(Edge(patent_node, link_node, True))
        else:
            self.gephi.add_edge(Edge(link_node, patent_node, True))
        self.nodes.add(identifier)

    def _add_entity(self, entity, patent_node):
        """Add an "entity" node and an edge from it to the patent node."""
        entity_node = Node(entity, **self.ENTITY_STYLE)
        entity_node.property['type'] = 'entity'
        self.gephi.add_node(entity_node)
        self.gephi.add_edge(Edge(entity_node, patent_node, True))