def __init__(self, reader, k=None):
    """Store the reader and start with empty clustering state.

    :param reader: source object kept on ``self.reader``; it is not used
        any further inside this constructor.
    :param k: optional value kept verbatim on ``self.k`` (presumably the
        requested number of clusters -- confirm against subclasses).
    """
    self.reader = reader
    self.datatxt = DataTXT()

    # Everything below starts empty; this constructor only creates the
    # containers.
    self.snippets, self.pages = [], []
    self.topic_set = defaultdict(int)
    self.rel_matrix = []
    self.k = k
    self.topics = []

    # Fixed seed value, presumably so that runs are reproducible; which
    # random generator ``seed`` belongs to is decided by the file imports.
    seed((1000, 2000))
def __init__(self, items):
    """Split ``items`` into parallel columns and create the API client.

    :param items: iterable of indexable records; position 0 is stored as
        the text, position 1 as the url and position 2 as the user.
    """
    texts = [record[0] for record in items]
    urls = [record[1] for record in items]
    users = [record[2] for record in items]

    self.texts, self.urls, self.user = texts, urls, users
    self.datatxt = DataTXT()
class Annotator(object):
    """Annotate short texts with entity pages through ``DataTXT.nex``.

    Each input item is an indexable record ``(text, url, user)``.  Before
    a text is sent for annotation, ``@username`` tokens and web addresses
    are replaced by underscores of the same length so they cannot be
    picked up as entities.
    """

    # Pattern for web addresses.  Written as a raw string so the ``\.``
    # escapes reach the regex engine untouched, and compiled once instead
    # of being looked up again for every token.
    _URL_RE = re.compile(
        r"((https?|ftp)://|(www|ftp)\.)[a-z0-9-]+(\.[a-z0-9-]+)+([/?].*)?"
    )

    def __init__(self, items):
        """Split ``items`` into parallel text / url / user lists.

        :param items: iterable of records indexable at positions 0, 1, 2.
        """
        self.texts = [x[0] for x in items]
        self.urls = [x[1] for x in items]
        self.user = [x[2] for x in items]
        self.datatxt = DataTXT()

    @classmethod
    def _mask_mentions_and_urls(cls, text):
        """Return ``text`` with usernames and URLs replaced by underscores."""
        masked = []
        for token in text.split(' '):
            # A token is hidden when it is a username (leading '@') or
            # when the URL pattern matches anywhere inside it.
            if token.startswith('@') or cls._URL_RE.search(token):
                token = "_" * len(token)
            masked.append(token)
        return u" ".join(masked)

    def annotate(self, test=None):
        """Annotate every stored text and index the tweets by entity page.

        :param test: when equal to ``"annotations"``, return only the list
            of annotated pages of one annotated text (an empty list when
            nothing was annotated); used by the doctests below.
        :returns: ``(annotated_texts, tweets)`` where ``annotated_texts``
            maps the ``'id'`` of each annotation response to that response
            (without its ``'id'`` key and with ``'annotations'`` replaced
            by a ``{page: confidence}`` dict), and ``tweets`` maps each
            page to the list of tweet dicts that mention it.

        >>> a = Annotator([('mozilla funziona google chrome', '', '')])
        >>> a.annotate(test="annotations")
        [u'http://en.wikipedia.org/wiki/Google_Chrome', u'http://en.wikipedia.org/wiki/Mozilla']
        >>> a = Annotator([('@mozilla funziona google chrome', '', '')])
        >>> a.annotate(test="annotations")
        [u'http://en.wikipedia.org/wiki/Google_Chrome']
        >>> a = Annotator([('mozilla funziona @google_chrome', '', '')])
        >>> a.annotate(test="annotations")
        [u'http://en.wikipedia.org/wiki/Mozilla']
        >>> a = Annotator([('@mozilla funziona @google_chrome', '', '')])
        >>> a.annotate(test="annotations")
        []
        >>> a = Annotator([('@google funziona http://google.com', '', '')])
        >>> a.annotate(test="annotations")
        []
        """
        tweets = defaultdict(list)
        annotated_texts = {}

        for i, text in enumerate(self.texts):
            annotation = self.datatxt.nex(self._mask_mentions_and_urls(text))
            if annotation is None:
                # No usable response for this text: skip it entirely.
                continue

            raw_annotations = annotation['annotations']
            d = {
                'text': text,
                'url': self.urls[i],
                'user': self.user[i],
                # Materialised so the value is a plain list on every
                # Python version (dict views are not lists).
                'annotations': list(raw_annotations.values()),
            }

            nice_page = {}
            for topic, ann in raw_annotations.items():
                page = topic
                if '://it.' in topic:
                    # Italian entity: try to map it to its English page.
                    english = self.datatxt.interWikiRecon \
                        .get_inter_wikilinks(topic).get('EN')
                    if english is not None:
                        page = english
                        nice_page[page] = ann['confidence']
                    # NOTE: an Italian page without an English counterpart
                    # is left out of ``nice_page`` but the tweet is still
                    # indexed under the Italian page below.
                else:
                    nice_page[page] = ann['confidence']
                tweets[page].append(d)

            annotation['annotations'] = nice_page
            # Key the response by its id and drop the id from the payload.
            annotated_texts[annotation.pop('id')] = annotation

        if test == "annotations":
            if not annotated_texts:
                # Nothing was annotated: report "no pages" instead of
                # failing on an empty collection.
                return []
            first = next(iter(annotated_texts.values()))
            return list(first['annotations'].keys())
        return annotated_texts, tweets
class BaseClusterify(object):
    """Shared scaffolding for clustering the topics found in a set of texts.

    The class annotates the texts supplied by ``reader``, counts how often
    each topic page occurs, builds a pairwise relatedness matrix through
    ``DataTXT.rel`` and formats cluster assignments for output.  The
    clustering step itself is left to subclasses via :meth:`do_cluster`.
    """

    def __init__(self, reader, k=None):
        """Store the reader and start with empty clustering state.

        :param reader: object whose ``texts()`` method supplies the items
            handed to ``Annotator`` in :meth:`annotate`.
        :param k: optional value stored on ``self.k``; not used by this
            base class (presumably the requested number of clusters).
        """
        self.reader = reader
        self.datatxt = DataTXT()
        self.snippets = []
        self.pages = []
        self.topic_set = defaultdict(int)
        self.rel_matrix = []
        self.k = k
        self.topics = []
        # Fixed seed value, presumably for reproducible runs.
        seed((1000, 2000))

    def annotate(self):
        """Annotate the reader's texts.

        Stores the annotated texts on ``self.snippets`` and returns the
        page -> tweets mapping produced by ``Annotator.annotate``.
        """
        annotator = Annotator(self.reader.texts())
        self.snippets, tweets = annotator.annotate()
        return tweets

    def _generate_topic_set(self):
        """Add one to ``self.topic_set[page]`` per snippet mentioning it.

        Counts are added to the existing ones, so calling this twice on
        the same snippets doubles every count.
        """
        logger.info("snippet set")
        logger.info(self.snippets)
        for snippet in self.snippets.values():
            for page in snippet.get('annotations'):
                self.topic_set[page] += 1.

    def _generate_adjagent_matrix(self):
        """Build the topic-by-topic relatedness matrix.

        Topics are sent to ``self.datatxt.rel`` in batches of 10 per side
        and only batch pairs on or above the diagonal are requested; each
        value is mirrored into the opposite cell.  The matrix is stored on
        ``self.rel_matrix`` and returned.
        """
        self._generate_topic_set()
        logger.info("topic set")
        logger.info(self.topic_set)

        # Materialised because the topics are sliced below, which a dict
        # view does not support.
        topics = list(self.topic_set)
        size = len(topics)
        rel = zeros((size, size))
        batch_size = 10

        for offset_x in range(0, size, batch_size):
            topics_x = topics[offset_x:offset_x + batch_size]
            for offset_y in range(offset_x, size, batch_size):
                topics_y = topics[offset_y:offset_y + batch_size]
                rel_values = self.datatxt.rel(topics_x, topics_y)
                for i in range(len(topics_x)):
                    for j in range(len(topics_y)):
                        value = rel_values[i][j]
                        rel[offset_x + i][offset_y + j] = value
                        rel[offset_y + j][offset_x + i] = value

        self.rel_matrix = rel
        return rel

    def _generate_cluster(self, ids):
        """Group topics by cluster id.

        :param ids: sequence where ``ids[i]`` is the cluster id of the
            i-th topic in the iteration order of ``self.topic_set``.
        :returns: dict mapping each cluster id to its list of topics.
        """
        # Snapshot the topics once instead of rebuilding the list for
        # every element of ``ids``.
        topics = list(self.topic_set)
        response = {}
        for position, cluster_id in enumerate(ids):
            response.setdefault(cluster_id, []).append(topics[position])
        return response

    def _generate_output_response(self, response):
        """Format clusters as ``{'clusters': [{topic: weight, ...}, ...]}``.

        :param response: dict mapping cluster ids to lists of topics, as
            returned by :meth:`_generate_cluster`.
        :returns: dict with a single ``'clusters'`` key; each cluster is a
            dict mapping a topic to the square root of its count.
        """
        if not self.topic_set:
            self._generate_topic_set()
        clusters = [
            {topic: self.topic_set[topic] ** .5 for topic in cluster}
            for cluster in response.values()
        ]
        return {
            'clusters': clusters
        }

    def do_cluster(self):
        """Run the clustering; must be provided by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError.
        raise NotImplementedError