def __get_files(message):
    files = []

    for file in message:
        new_file = {
            'file_user_id': file['user_id'],
            'file_post_id': file['post_id'],
            'file_create_at': unixtime_to_datetime(file['create_at'] / 1000).isoformat(),
            'file_update_at': unixtime_to_datetime(file['update_at'] / 1000).isoformat(),
            'file_delete_at': None if file['delete_at'] == 0
            else unixtime_to_datetime(file['delete_at'] / 1000).isoformat(),
            'file_name': file['name'],
            'file_extension': file['extension'],
            'file_size': file['size'],
            'file_type': file['mime_type'],
            'file_mini_preview': file['mini_preview']
        }
        files.append(new_file)

    return files
def add_update_date(self, item):
    """All item['updated_on'] values from Perceval are epoch timestamps."""
    updated = unixtime_to_datetime(item['updated_on'])
    timestamp = unixtime_to_datetime(item['timestamp'])
    item['metadata__updated_on'] = updated.isoformat()
    # Also add the timestamp used in incremental enrichment
    item['metadata__timestamp'] = timestamp.isoformat()
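# A minimal usage sketch for add_update_date() (the enricher instance and the
# item payload below are hypothetical; the expected output follows the
# conversion exercised in test_dates() further down):
#
#   item = {'updated_on': 1426868155.0, 'timestamp': 1426868155.0}
#   enricher.add_update_date(item)
#   item['metadata__updated_on']   # -> '2015-03-20T16:15:55+00:00'
#   item['metadata__timestamp']    # -> '2015-03-20T16:15:55+00:00'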
def get_to_date(es_in, in_index, out_index, repository_url, interval):
    """Get the appropriate to_date value for incremental insertion."""
    study_data_available = False

    if es_in.indices.exists(index=out_index):
        last_study_date = es_in.search(
            index=out_index,
            body=get_last_study_date(repository_url, interval))["aggregations"]["1"]

        if "value_as_string" in last_study_date and last_study_date["value_as_string"]:
            study_data_available = True
            to_date = str_to_datetime(last_study_date["value_as_string"])
        elif "value" in last_study_date and last_study_date["value"]:
            study_data_available = True
            try:
                to_date = unixtime_to_datetime(last_study_date["value"])
            except Exception:
                to_date = unixtime_to_datetime(last_study_date["value"] / 1000)

    if not study_data_available:
        first_item_date = es_in.search(
            index=in_index,
            body=get_first_enriched_date(repository_url))["aggregations"]["1"]["hits"]["hits"][0]["_source"]

        to_date = str_to_datetime(first_item_date["metadata__updated_on"])

    return to_date
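# The seconds-vs-milliseconds fallback above is a recurring pattern in this
# codebase: Elasticsearch date aggregations may report epoch milliseconds,
# while other numeric fields hold epoch seconds. A minimal standalone sketch
# (the helper name is ours, not part of the codebase):

def _epoch_to_datetime(value):
    """Convert an epoch in seconds, or in milliseconds as a fallback, to a datetime."""
    try:
        return unixtime_to_datetime(value)
    except Exception:
        # Out-of-range values are assumed to be epoch milliseconds
        return unixtime_to_datetime(value / 1000)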
def test_dates(self):
    """Check if it converts some timestamps to datetime objects."""
    date = unixtime_to_datetime(0)
    expected = datetime.datetime(1970, 1, 1, 0, 0, 0,
                                 tzinfo=dateutil.tz.tzutc())
    self.assertIsInstance(date, datetime.datetime)
    self.assertEqual(date, expected)

    date = unixtime_to_datetime(1426868155.0)
    expected = datetime.datetime(2015, 3, 20, 16, 15, 55,
                                 tzinfo=dateutil.tz.tzutc())
    self.assertIsInstance(date, datetime.datetime)
    self.assertEqual(date, expected)
def __get_reactions(message):
    reactions = []

    for reaction in message:
        new_reaction = {
            'reaction_user_id': reaction['user_id'],
            'reaction_post_id': reaction['post_id'],
            'reaction_emoji_name': reaction['emoji_name'],
            'reaction_create_at': unixtime_to_datetime(reaction['create_at'] / 1000).isoformat(),
            'reaction_update_at': unixtime_to_datetime(reaction['update_at'] / 1000).isoformat(),
            'reaction_delete_at': None if reaction['delete_at'] == 0
            else unixtime_to_datetime(reaction['delete_at'] / 1000).isoformat()
        }
        reactions.append(new_reaction)

    return reactions
def get_rich_comment(self, item, answer, comment):
    ecomment = self.get_rich_item(item)  # reuse all fields from item
    ecomment['id'] = str(ecomment['id']) + '_' + str(answer['id']) + '_' + str(comment['id'])
    ecomment['url'] = item['data']['url'] + "/?answer="
    ecomment['url'] += answer['id'] + '#post-id-' + answer['id']

    if 'author' in comment:
        # Not sure if this format is present in some versions of askbot
        ecomment['author_askbot_user_name'] = comment['author']['username']
        ecomment['author_askbot_id'] = str(comment['author']['id'])
        ecomment['author_url'] = ecomment['origin'] + '/users/'
        ecomment['author_url'] += comment['author']['id'] + '/' + comment['author']['username']
    elif 'user_display_name' in comment:
        ecomment['author_askbot_user_name'] = comment['user_display_name']
        ecomment['author_askbot_id'] = str(comment['user_id'])

    if 'summary' in comment:
        ecomment['summary'] = comment['summary']

    ecomment['score'] = int(comment['score']) if comment['score'] else 0

    dfield = 'added_at'
    if 'comment_added_at' in comment:
        dfield = 'comment_added_at'

    if self.sortinghat:
        if dfield == 'added_at':
            comment['added_at_date'] = unixtime_to_datetime(float(comment[dfield])).isoformat()
        else:
            comment['added_at_date'] = comment[dfield]

        ecomment.update(self.get_item_sh(comment, date_field="added_at_date"))
        if ecomment['author_user_name'] != ecomment['author_askbot_user_name']:
            logger.warning('Bad SH identity in askbot comment. Found %s expecting %s',
                           ecomment['author_user_name'], ecomment['author_askbot_user_name'])

    if dfield == 'added_at':
        comment_at = unixtime_to_datetime(float(comment[dfield]))
    else:
        comment_at = str_to_datetime(comment[dfield])

    added_at = unixtime_to_datetime(float(item['data']["added_at"]))
    ecomment['time_from_question'] = get_time_diff_days(added_at, comment_at)

    ecomment['type'] = 'comment'
    ecomment.update(self.get_grimoire_fields(comment_at.isoformat(), ecomment['type']))

    # Clean item fields not valid in comments
    for f in ['is_askbot_question', 'author_reputation', 'author_badges',
              'is_correct', 'comment_count']:
        if f in ecomment:
            ecomment.pop(f)

    return ecomment
def _handle_successful_job(self, event):
    """Handle successful jobs"""
    job = rq.job.Job.fetch(event.job_id, connection=self.conn)
    result = job.result
    task_id = job.kwargs['task_id']

    try:
        task = self.registry.get(task_id)
    except NotFoundError:
        logger.warning("Task %s not found; related job #%s will not be rescheduled",
                       task_id, job.id)
        return

    if task.archiving_cfg and task.archiving_cfg.fetch_from_archive:
        logger.info("Job #%s (task: %s) successfully finished", job.id, task_id)
        task.status = TaskStatus.COMPLETED
        return

    if result.nitems > 0:
        task.backend_args['next_from_date'] = unixtime_to_datetime(result.max_date)

        if result.offset:
            task.backend_args['next_offset'] = result.offset

    delay = task.scheduling_cfg.delay if task.scheduling_cfg else WAIT_FOR_QUEUING
    self._scheduler.schedule_task(task_id, delay=delay)

    logger.info("Task: %s re-scheduled", task_id)
def run(self, backend_args, archive_args=None, resume=False):
    """Run the backend with the given parameters.

    The method will run the backend assigned to this job,
    storing the fetched items in a Redis queue. The ongoing
    status of the job can be accessed through the property
    `result`. When `resume` is set, the job will start from
    the last execution, overwriting the 'from_date' and
    'offset' parameters, if needed.

    When the parameter `fetch_from_archive` is set to `True`,
    items can be fetched from the archive assigned to this job.

    Any exception during the execution of the process will
    be raised.

    :param backend_args: parameters used to run the backend
    :param archive_args: archive arguments
    :param resume: fetch items starting where the last execution stopped
    """
    args = backend_args.copy()

    if archive_args:
        self.initialize_archive_manager(archive_args['archive_path'])

    if not resume:
        max_date = backend_args.get('from_date', None)
        offset = backend_args.get('offset', None)

        if max_date:
            max_date = datetime_to_utc(max_date).timestamp()

        self._result = JobResult(self.job_id, self.task_id,
                                 self.backend, self.category,
                                 None, max_date, 0,
                                 offset=offset, nresumed=0)
    else:
        if self.result.max_date:
            args['from_date'] = unixtime_to_datetime(self.result.max_date)

        if self.result.offset:
            args['offset'] = self.result.offset

        self._result.nresumed += 1

    for item in self._execute(args, archive_args):
        self.conn.rpush(self.qitems, pickle.dumps(item))

        self._result.nitems += 1
        self._result.last_uuid = item['uuid']

        if not self.result.max_date or self.result.max_date < item['updated_on']:
            self._result.max_date = item['updated_on']

        if 'offset' in item:
            self._result.offset = item['offset']
def get_rich_item_comments(self, comments, eitem):
    for comment in comments:
        ecomment = copy.deepcopy(eitem)
        created = unixtime_to_datetime(comment['created'] / 1000).isoformat()

        ecomment['url'] = comment['link']
        ecomment['id'] = ecomment['id'] + '_comment_' + str(comment['id'])
        ecomment['comment'] = comment['comment']
        ecomment['like_count'] = comment['like_count']
        ecomment['type'] = 'comment'
        ecomment.update(self.get_grimoire_fields(created, ecomment['type']))
        ecomment.pop('is_meetup_meetup')

        # event host fields: author of the event
        member = comment['member']
        if 'photo' in member:
            ecomment['member_photo_url'] = member['photo']['photo_link']
            ecomment['member_photo_id'] = member['photo']['id']
            ecomment['member_photo_type'] = member['photo']['type']
        if 'event_context' in member:
            ecomment['member_is_host'] = member['event_context']['host']
        ecomment['member_id'] = member['id']
        ecomment['member_name'] = member['name']
        ecomment['member_url'] = "https://www.meetup.com/members/" + str(member['id'])

        if self.sortinghat:
            ecomment.update(self.get_item_sh(comment))

        yield ecomment
def get_rich_item_rsvps(self, rsvps, eitem):
    for rsvp in rsvps:
        ersvp = copy.deepcopy(eitem)
        ersvp['type'] = 'rsvp'
        created = unixtime_to_datetime(rsvp['created'] / 1000).isoformat()
        ersvp.update(self.get_grimoire_fields(created, ersvp['type']))
        ersvp.pop('is_meetup_meetup')

        # event host fields: author of the event
        member = rsvp['member']
        if 'photo' in member:
            ersvp['member_photo_url'] = member['photo']['photo_link']
            ersvp['member_photo_id'] = member['photo']['id']
            ersvp['member_photo_type'] = member['photo']['type']
        ersvp['member_is_host'] = member['event_context']['host']
        ersvp['member_id'] = member['id']
        ersvp['member_name'] = member['name']
        ersvp['member_url'] = "https://www.meetup.com/members/" + str(member['id'])

        ersvp['id'] = ersvp['id'] + '_rsvp_' + str(rsvp['event']['id']) + "_" + str(member['id'])
        ersvp['url'] = "https://www.meetup.com/members/" + str(member['id'])

        ersvp['rsvps_guests'] = rsvp['guests']
        ersvp['rsvps_updated'] = rsvp['updated']
        ersvp['rsvps_response'] = rsvp['response']

        if self.sortinghat:
            ersvp.update(self.get_item_sh(rsvp))

        yield ersvp
def _handle_successful_job(self, job):
    """Handle successful jobs"""
    result = job.result
    task_id = job.kwargs['task_id']

    try:
        task = self.registry.get(task_id)
    except NotFoundError:
        logger.warning("Task %s not found; related job #%s will not be rescheduled",
                       task_id, job.id)
        return

    if task.archiving_cfg and task.archiving_cfg.fetch_from_archive:
        logger.info("Job #%s (task: %s) successfully finished", job.id, task_id)
        return

    if result.nitems > 0:
        task.backend_args['next_from_date'] = unixtime_to_datetime(result.max_date)

        if result.offset:
            task.backend_args['next_offset'] = result.offset

    job_args = self._build_job_arguments(task)

    delay = task.scheduling_cfg.delay if task.scheduling_cfg else WAIT_FOR_QUEUING
    job_id = self._scheduler.schedule_job_task(Q_UPDATING_JOBS, task_id,
                                               job_args, delay=delay)

    logger.info("Job #%s (task: %s, old job: %s) re-scheduled", job_id, task_id, job.id)
def __fetch_pull_requests(self, from_date, to_date):
    """Fetch the pull requests"""
    raw_pulls = self.client.pulls()

    for raw_pull in raw_pulls:
        pulls = json.loads(raw_pull)
        for pull in pulls['values']:
            compare_time = unixtime_to_datetime(float(pull['updatedDate']) / 1000)
            if pull['updatedDate'] and to_date < compare_time < from_date:
                return

            self.__init_extra_pull_fields(pull)

            pull['commits_data'] = self.__get_pull_commits(pull['id'])
            pull['comments_data'] = self.__get_pull_comments(pull['id'], from_date, to_date)

            for field in TARGET_PULL_FIELDS:
                if field not in pull:
                    continue

                if field == 'author':
                    pull[field + '_data'] = self.__get_user(pull[field]['user']['slug'])
                elif field == 'reviewers':
                    for reviewer in pull[field]:
                        pull[field + '_data'].append(self.__get_user(reviewer['user']['slug']))

            yield pull
def __call__(self, event):
    result = event.payload
    job_id = event.job_id
    task_id = event.task_id

    try:
        task = self.task_scheduler.registry.get(task_id)
    except NotFoundError:
        logger.debug("Task %s not found; orphan event %s for job #%s ignored",
                     task_id, event.uuid, job_id)
        return False

    if task.archiving_cfg and task.archiving_cfg.fetch_from_archive:
        task.status = TaskStatus.COMPLETED
        logger.info("Job #%s (task: %s - archiving) finished successfully",
                    job_id, task_id)
        return True

    if result.nitems > 0:
        task.backend_args['next_from_date'] = unixtime_to_datetime(result.max_date)

        if result.offset:
            task.backend_args['next_offset'] = result.offset

    delay = task.scheduling_cfg.delay if task.scheduling_cfg else WAIT_FOR_QUEUING
    self.task_scheduler.schedule_task(task_id, delay=delay)

    logger.info("Task: %s re-scheduled", task_id)

    return True
def get_rich_answer(self, item, answer):
    eanswer = self.get_rich_item(item)  # reuse all fields from item
    eanswer['id'] = str(eanswer['id']) + '_' + str(answer['id'])
    eanswer['url'] = item['data']['url'] + "/?answer="
    eanswer['url'] += answer['id'] + '#post-id-' + answer['id']

    if type(answer['answered_by']) is dict:
        eanswer['author_askbot_user_name'] = answer['answered_by']['username']
        eanswer['author_askbot_id'] = str(answer['answered_by']['id'])
        eanswer['author_badges'] = answer['answered_by']['badges']
        eanswer['author_reputation'] = int(answer['answered_by']['reputation'])
        eanswer['author_url'] = eanswer['origin'] + '/users/'
        eanswer['author_url'] += answer['answered_by']['id'] + '/'
        eanswer['author_url'] += answer['answered_by']['username']

    eanswer['summary'] = answer['summary']
    eanswer['is_accepted_answer'] = 1 if answer['accepted'] else 0
    eanswer['answer_status'] = "accepted" if answer['accepted'] else "not_accepted"
    eanswer['score'] = int(answer['score']) if answer['score'] else 0
    if 'is_correct' in answer:
        eanswer['is_correct'] = 1

    if self.sortinghat:
        answer['added_at_date'] = unixtime_to_datetime(float(answer["added_at"])).isoformat()
        eanswer.update(self.get_item_sh(answer, date_field="added_at_date"))
        if 'author_askbot_user_name' in eanswer and \
                eanswer['author_user_name'] != eanswer['author_askbot_user_name']:
            logger.warning('[askbot] Bad SH identity in askbot answer. Found {} expecting {}'
                           .format(eanswer['author_user_name'], eanswer['author_askbot_user_name']))

    answer_at = unixtime_to_datetime(float(answer["added_at"]))
    added_at = unixtime_to_datetime(float(item['data']["added_at"]))
    eanswer['time_from_question'] = get_time_diff_days(added_at, answer_at)

    eanswer['type'] = 'answer'
    eanswer.update(self.get_grimoire_fields(answer_at.isoformat(), eanswer['type']))

    # Clean item fields not valid in answers
    eanswer.pop('is_askbot_question')

    return eanswer
def __convert_str_to_datetime(text):
    try:
        str_date = str_to_datetime(text)
    except Exception:
        try:
            str_date = unixtime_to_datetime(text)
        except Exception:
            str_date = None

    return str_date
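# Usage sketch for __convert_str_to_datetime() (hypothetical inputs): ISO-like
# strings are handled by str_to_datetime(), epoch numbers fall through to
# unixtime_to_datetime(), and anything unparseable yields None.
#
#   __convert_str_to_datetime('2015-03-20T16:15:55')  # parsed by str_to_datetime
#   __convert_str_to_datetime(1426868155.0)           # parsed by unixtime_to_datetime
#   __convert_str_to_datetime('not-a-date')           # -> None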
def get_time_to_first_attention(self, item):
    """Get the first date at which a comment was made to the issue by someone
    other than the user who created the issue.
    """
    comment_dates = [unixtime_to_datetime(float(comment['date_created'])).isoformat()
                     for comment in item['comments']
                     if item['user']['name'] != comment['user']['name']]
    if comment_dates:
        return min(comment_dates)
    return None
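# Note on the min() above: ISO 8601 strings that share the same UTC offset
# sort lexicographically in chronological order, so taking min() over the
# isoformat() values picks the earliest comment date. Tiny sketch:
#
#   min(['2015-03-20T16:15:55+00:00', '2014-01-01T00:00:00+00:00'])
#   # -> '2014-01-01T00:00:00+00:00'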
def _fix_review_dates(self, item):
    """Convert dates so ES detects them"""
    for date_field in ['timestamp', 'createdOn', 'lastUpdated']:
        if date_field in item.keys():
            date_ts = item[date_field]
            item[date_field] = unixtime_to_datetime(date_ts).isoformat()

    if 'patchSets' in item.keys():
        for patch in item['patchSets']:
            pdate_ts = patch['createdOn']
            patch['createdOn'] = unixtime_to_datetime(pdate_ts).isoformat()

            if 'approvals' in patch:
                for approval in patch['approvals']:
                    adate_ts = approval['grantedOn']
                    approval['grantedOn'] = unixtime_to_datetime(adate_ts).isoformat()

    if 'comments' in item.keys():
        for comment in item['comments']:
            cdate_ts = comment['timestamp']
            comment['timestamp'] = unixtime_to_datetime(cdate_ts).isoformat()
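# Before/after sketch for _fix_review_dates() with a hypothetical Gerrit
# review item (epoch seconds in, ISO 8601 strings out):
#
#   item = {'createdOn': 1426868155,
#           'patchSets': [{'createdOn': 1426868155}]}
#   enricher._fix_review_dates(item)
#   item['createdOn']                   # -> '2015-03-20T16:15:55+00:00'
#   item['patchSets'][0]['createdOn']   # -> '2015-03-20T16:15:55+00:00'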
def update(self, item):
    """Update the summary attributes by accessing the item data.

    :param item: a Perceval item
    """
    self.fetched += 1
    self.last_uuid = item['uuid']

    updated_on = unixtime_to_datetime(item['updated_on'])
    self.min_updated_on = updated_on if not self.min_updated_on \
        else min(self.min_updated_on, updated_on)
    self.max_updated_on = updated_on if not self.max_updated_on \
        else max(self.max_updated_on, updated_on)
    self.last_updated_on = updated_on

    offset = item.get('offset', None)
    if offset is not None:
        self.last_offset = offset
        self.min_offset = offset if self.min_offset is None else min(self.min_offset, offset)
        self.max_offset = offset if self.max_offset is None else max(self.max_offset, offset)
def __fetch_merge_requests(self, from_date):
    """Fetch the merge requests."""
    fetch_completed = False
    fetch_from_date = from_date
    last_date = fetch_from_date

    while not fetch_completed:
        try:
            for mr_item in self.__fetch_merge_requests_data(fetch_from_date):
                last_date = unixtime_to_datetime(self.metadata_updated_on(mr_item))
                yield mr_item
        except _OutdatedMRsList:
            fetch_from_date = last_date
            logger.debug("MRs list is outdated. Recalculating MR list starting on %s",
                         fetch_from_date)
        else:
            fetch_completed = True
def get_item_sh(self, item):
    """Add sorting hat enrichment fields"""
    sh_fields = {}

    # The common get_item_sh is not reused here because this logic is pretty specific
    if 'member' in item:
        # comment and rsvp
        identity = self.get_sh_identity(item['member'])
    elif 'event_hosts' in item:
        # meetup event
        identity = self.get_sh_identity(item['event_hosts'][0])
    else:
        return sh_fields

    created = unixtime_to_datetime(item['created'] / 1000)
    sh_fields = self.get_item_sh_fields(identity, created)

    return sh_fields
def test_delete_items_wrong_retention(self):
    """Test whether no items are deleted if retention isn't defined or is negative"""
    items = json.loads(read_file('data/git.json'))
    for item in items:
        timestamp = unixtime_to_datetime(item['timestamp'])
        item['timestamp'] = timestamp.isoformat()

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)

    url = self.es_con + '/' + self.target_index + '/_count'

    elastic.delete_items(retention_time=None, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 11)

    elastic.delete_items(retention_time=-1, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 11)
def test_delete_items(self):
    """Test whether items are correctly deleted"""
    items = json.loads(read_file('data/git.json'))
    for item in items:
        timestamp = unixtime_to_datetime(item['timestamp'])
        item['timestamp'] = timestamp.isoformat()

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 9)

    url = self.es_con + '/' + self.target_index + '/_count'

    elastic.delete_items(retention_time=90000000, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 9)

    elastic.delete_items(retention_time=1, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 0)
def __get_pull_comments(self, pr_number, from_date, to_date):
    comments = []
    group_pull_comments = self.client.pull_comments(pr_number)

    for raw_pull_comments in group_pull_comments:
        group_pull_comments = json.loads(raw_pull_comments)
        for comment in group_pull_comments['values']:
            compare_time = unixtime_to_datetime(float(comment['createdDate']) / 1000)
            if to_date < compare_time < from_date:
                return comments

            if 'comment' not in comment:
                continue

            if 'user' in comment:
                comment['user_data'] = self.__get_user(comment['user']['slug'])

            comments.append(comment)

    return comments
def get_rich_item(self, item):
    eitem = {}

    self.__fill_phab_ids(item['data'])

    for f in self.RAW_FIELDS_COPY:
        if f in item:
            eitem[f] = item[f]
        else:
            eitem[f] = None

    # The real data
    phab_item = item['data']

    # data fields to copy
    copy_fields = ["phid", "id", "type"]
    for f in copy_fields:
        if f in phab_item:
            eitem[f] = phab_item[f]
        else:
            eitem[f] = None

    # Fields whose names are translated
    map_fields = {
        "id": "bug_id"
    }
    for f in map_fields:
        if f in phab_item:
            eitem[map_fields[f]] = phab_item[f]
        else:
            eitem[map_fields[f]] = None

    eitem['num_changes'] = len(phab_item['transactions'])

    if 'authorData' in phab_item['fields'] and phab_item['fields']['authorData']:
        # eitem['author_roles'] = ",".join(phab_item['fields']['authorData']['roles'])
        eitem['author_roles'] = phab_item['fields']['authorData']['roles']
        eitem['author_userName'] = phab_item['fields']['authorData']['userName']
        eitem['author_realName'] = phab_item['fields']['authorData']['realName']

    if 'ownerData' in phab_item['fields'] and phab_item['fields']['ownerData']:
        eitem['assigned_to_roles'] = phab_item['fields']['ownerData']['roles']
        eitem['assigned_to_userName'] = phab_item['fields']['ownerData']['userName']
        eitem['assigned_to_realName'] = phab_item['fields']['ownerData']['realName']

    eitem['priority'] = phab_item['fields']['priority']['name']
    eitem['priority_value'] = phab_item['fields']['priority']['value']
    eitem['status'] = phab_item['fields']['status']['name']
    eitem['creation_date'] = unixtime_to_datetime(phab_item['fields']['dateCreated']).isoformat()
    eitem['modification_date'] = unixtime_to_datetime(phab_item['fields']['dateModified']).isoformat()
    eitem['update_date'] = unixtime_to_datetime(item['updated_on']).isoformat()
    eitem['main_description'] = phab_item['fields']['name']
    eitem['main_description_analyzed'] = eitem['main_description']
    eitem['url'] = eitem['origin'] + "/T" + str(eitem['bug_id'])

    # Time to assign (time to open -> time to assign)
    eitem['time_to_assign_days'] = None
    # Time to attend (time to assign -> time to first activity from assignee)
    eitem['time_to_attend_days'] = None
    # Time to close (time open -> time last updated for closed tasks)
    # We can improve it later using events: time open event -> time resolved event
    eitem['time_to_close_days'] = None
    if eitem['status'] not in [TASK_OPEN_STATUS, 'Spite', 'Stalled']:
        eitem['time_to_close_days'] = \
            get_time_diff_days(eitem['creation_date'], eitem['update_date'])
    # Time open (time to open -> now): computed with painless
    # Time open using the enrich date. Field needed for filtering.
    eitem['time_open_days_enrich'] = get_time_diff_days(eitem['creation_date'],
                                                        datetime_utcnow().replace(tzinfo=None))
    # Time from last update (time last update -> now): computed with painless

    eitem['changes'] = len(phab_item['transactions'])
    # Number of assignment changes
    eitem['changes_assignment'] = 0
    # Number of assignees in the changes
    eitem['changes_assignee_number'] = 0
    # List of the changes' assignees
    changes_assignee_list = []
    first_assignee_phid = None
    first_assignee_date = None

    # We need to reverse them to go from older to newer
    phab_item['transactions'].reverse()
    for change in phab_item['transactions']:
        change_date = unixtime_to_datetime(float(change['dateCreated'])).isoformat()
        if change["transactionType"] == "reassign":
            if not eitem['time_to_assign_days']:
                eitem['time_to_assign_days'] = get_time_diff_days(eitem['creation_date'], change_date)
                first_assignee_phid = change['newValue']
                first_assignee_date = change_date
            if 'authorData' in change and change['authorData'] and 'userName' in change['authorData'] \
                    and change['authorData']['userName'] not in changes_assignee_list:
                changes_assignee_list.append(change['authorData']['userName'])
            eitem['changes_assignment'] += 1
        if not eitem['time_to_attend_days'] and first_assignee_phid:
            if 'authorData' in change and change['authorData'] \
                    and change['authorData']['phid'] == first_assignee_phid:
                eitem['time_to_attend_days'] = get_time_diff_days(first_assignee_date, change_date)
    eitem['changes_assignee_number'] = len(changes_assignee_list)
    eitem['changes_assignee_list'] = ','.join(changes_assignee_list)

    eitem['comments'] = 0
    for tr in phab_item['transactions']:
        if tr['comments']:
            eitem['comments'] += 1

    eitem['tags'] = []
    for project in phab_item['projects']:
        if project:
            eitem['tags'].append(project['name'])
    eitem['tags_analyzed'] = eitem['tags']
    eitem['tags_custom_analyzed'] = eitem['tags']

    if self.sortinghat:
        eitem.update(self.get_item_sh(item, self.roles))

    if self.prjs_map:
        eitem.update(self.get_item_project(eitem))

    eitem.update(self.get_grimoire_fields(eitem['creation_date'], "task"))

    # Support old fields used in the maniphest panel T2305
    eitem['timeopen_days'] = eitem['time_open_days_enrich']

    assigned_to = {}
    for f in eitem.keys():
        if 'ownerData' in f:
            # Copy all ownerData fields to assigned_to fields
            of = f.split('ownerData')[1]
            assigned_to['assigned_to' + of] = eitem[f]
    eitem.update(assigned_to)

    self.add_repository_labels(eitem)
    self.add_metadata_filter_raw(eitem)

    return eitem
def get_rich_item(self, item):
    eitem = {}

    for f in self.RAW_FIELDS_COPY:
        if f in item:
            eitem[f] = item[f]
        else:
            eitem[f] = None

    # The real data
    message = item['data']

    eitem["reply_count"] = 0  # be sure it is always included

    # data fields to copy
    copy_fields = ["text", "type", "reply_count", "subscribed",
                   "subtype", "unread_count", "user"]
    for f in copy_fields:
        if f in message:
            eitem[f] = message[f]
        else:
            eitem[f] = None

    eitem['text_analyzed'] = eitem['text']

    eitem['number_attachs'] = 0
    if 'attachments' in message and message['attachments']:
        eitem['number_attachs'] = len(message['attachments'])

    eitem['reaction_count'] = 0
    if 'reactions' in message:
        eitem['reaction_count'] = len(message['reactions'])
        eitem['reactions'] = []
        for rdata in message['reactions']:
            for i in range(0, rdata['count']):
                eitem['reactions'].append(rdata["name"])

    if 'file' in message:
        eitem['file_type'] = message['file']['pretty_type']
        eitem['file_title'] = message['file']['title']
        eitem['file_size'] = message['file']['size']
        eitem['file_name'] = message['file']['name']
        eitem['file_mode'] = message['file']['mode']
        eitem['file_is_public'] = message['file']['is_public']
        eitem['file_is_external'] = message['file']['is_external']
        eitem['file_id'] = message['file']['id']
        eitem['file_is_editable'] = message['file']['editable']

    if 'user_data' in message:
        eitem['team_id'] = None  # does not exist in Mattermost
        if 'timezone' in message['user_data']:
            if message['user_data']['timezone']['useAutomaticTimezone']:
                eitem['tz'] = message['user_data']['timezone']['automaticTimezone']
            else:
                eitem['tz'] = message['user_data']['timezone']['manualTimezone']
            # tz must be in the -12h..12h interval, so convert seconds -> hours
            if eitem['tz']:
                eitem['tz'] = round(int(eitem['tz']) / (60 * 60))
        if 'is_admin' in message['user_data']:
            eitem['is_admin'] = message['user_data']['is_admin']
        if 'is_owner' in message['user_data']:
            eitem['is_owner'] = message['user_data']['is_owner']
        if 'is_primary_owner' in message['user_data']:
            eitem['is_primary_owner'] = message['user_data']['is_primary_owner']
        if 'profile' in message['user_data']:
            if 'title' in message['user_data']['profile']:
                eitem['profile_title'] = message['user_data']['profile']['title']
            eitem['avatar'] = message['user_data']['profile']['image_32']

    eitem['channel_name'] = message['channel_data']['name']
    eitem['channel_id'] = message['channel_data']['id']
    eitem['channel_created'] = unixtime_to_datetime(
        message['channel_data']['create_at'] / 1000).isoformat()
    eitem['channel_member_count'] = None

    eitem = self.__convert_booleans(eitem)

    if self.sortinghat:
        eitem.update(self.get_item_sh(item))

    if self.prjs_map:
        eitem.update(self.get_item_project(eitem))

    eitem.update(self.get_grimoire_fields(item["metadata__updated_on"], "message"))

    self.add_repository_labels(eitem)
    self.add_metadata_filter_raw(eitem)

    return eitem
def get_rich_events(self, item):
    """
    In the events there are some fields in common with the task. The name
    of the field must be the same in the task and in the event, so we can
    filter using it in the task and in the event at the same time.

    * Fields that don't change: the field does not change with the events
      of a task, so the value is always the same in the events of a task.
    * Fields that change: the value of the field changes with the events.
    """
    # To get values from the task
    eitem = self.get_rich_item(item)

    # Fields that never change
    task_fields_nochange = ['author_userName', 'creation_date', 'url', 'id', 'bug_id']

    # Follow changes in these fields
    task_fields_change = ['priority_value', 'status', 'assigned_to_userName', 'tags_custom_analyzed']
    task_change = {}
    for f in task_fields_change:
        task_change[f] = None
    task_change['status'] = TASK_OPEN_STATUS
    task_change['tags_custom_analyzed'] = eitem['tags_custom_analyzed']

    # Events are in the transactions field (changes in fields)
    transactions = item['data']['transactions']

    if not transactions:
        return []

    for t in transactions:
        event = {}
        # Needed for incremental updates from the item
        event['metadata__updated_on'] = item['metadata__updated_on']
        event['origin'] = item['origin']
        # Real event data
        event['transactionID'] = t['transactionID']
        event['type'] = t['transactionType']
        event['username'] = None
        if 'authorData' in t and 'userName' in t['authorData']:
            event['event_author_name'] = t['authorData']['userName']
        event['update_date'] = unixtime_to_datetime(float(t['dateCreated'])).isoformat()
        event['oldValue'] = ''
        event['newValue'] = ''
        if event['type'] == 'core:edge':
            for val in t['oldValue']:
                if val in self.phab_ids_names:
                    val = self.phab_ids_names[val]
                event['oldValue'] += "," + val
            event['oldValue'] = event['oldValue'][1:]  # remove first comma
            for val in t['newValue']:
                if val in self.phab_ids_names:
                    val = self.phab_ids_names[val]
                event['newValue'] += "," + val
            event['newValue'] = event['newValue'][1:]  # remove first comma
        elif event['type'] in ['status', 'description', 'priority', 'reassign',
                               'title', 'space', 'core:create', 'parent']:
            # Convert to str so the field is always a string
            event['oldValue'] = str(t['oldValue'])
            if event['oldValue'] in self.phab_ids_names:
                event['oldValue'] = self.phab_ids_names[event['oldValue']]
            event['newValue'] = str(t['newValue'])
            if event['newValue'] in self.phab_ids_names:
                event['newValue'] = self.phab_ids_names[event['newValue']]
        elif event['type'] == 'core:comment':
            event['newValue'] = t['comments']
        elif event['type'] == 'core:subscribers':
            event['newValue'] = ",".join(t['newValue'])
        else:
            # logger.debug("Event type %s old to new value not supported", t['transactionType'])
            pass

        for f in task_fields_nochange:
            # The field name must be the same as in the task for filtering
            event[f] = eitem[f]

        # To track the history of some fields
        if event['type'] in ['status']:
            task_change['status'] = event['newValue']
        elif event['type'] == 'priority':
            task_change['priority'] = event['newValue']
        elif event['type'] == 'core:edge':
            task_change['tags_custom_analyzed'] = [event['newValue']]
        if event['type'] in ['reassign']:
            # Try to get the userName and not the user id
            if event['newValue'] in self.phab_ids_names:
                task_change['assigned_to_userName'] = self.phab_ids_names[event['newValue']]
                event['newValue'] = task_change['assigned_to_userName']
            else:
                task_change['assigned_to_userName'] = event['newValue']
            if event['oldValue'] in self.phab_ids_names:
                # Try to get the userName and not the user id
                event['oldValue'] = self.phab_ids_names[event['oldValue']]

        for f in task_change:
            event[f] = task_change[f]

        yield event
def get_last_item_field(self, field, filters_=[], offset=False):
    """Find the offset/date of the last item stored in the index.

    :param field: field with the data
    :param filters_: additional filters to find the date
    :param offset: if True, returns the offset field instead of the date field
    """
    last_value = None

    url = self.index_url
    url += "/_search"

    if filters_ is None:
        filters_ = []

    terms = []
    for filter_ in filters_:
        if not filter_:
            continue
        term = '''{"term" : { "%s" : "%s"}}''' % (filter_['name'], filter_['value'])
        terms.append(term)

    data_query = '''"query": {"bool": {"filter": [%s]}},''' % (','.join(terms))

    data_agg = '''
        "aggs": {
            "1": {
              "max": {
                "field": "%s"
              }
            }
        }
    ''' % field

    data_json = '''
    {
        "size": 0,
        %s
        %s
    }
    ''' % (data_query, data_agg)

    logger.debug("{} {}".format(anonymize_url(url), data_json))
    headers = {"Content-Type": "application/json"}

    res = self.requests.post(url, data=data_json, headers=headers)
    res.raise_for_status()
    res_json = res.json()

    if 'aggregations' in res_json:
        last_value = res_json["aggregations"]["1"]["value"]

        if offset:
            if last_value is not None:
                last_value = int(last_value)
        else:
            if "value_as_string" in res_json["aggregations"]["1"]:
                last_value = res_json["aggregations"]["1"]["value_as_string"]
                last_value = str_to_datetime(last_value)
            else:
                last_value = res_json["aggregations"]["1"]["value"]
                if last_value:
                    try:
                        last_value = unixtime_to_datetime(last_value)
                    except InvalidDateError:
                        # last_value is in milliseconds
                        last_value = unixtime_to_datetime(last_value / 1000)

    return last_value
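# For reference, a hypothetical request body built by get_last_item_field()
# for field='metadata__updated_on' and a single filter
# {'name': 'origin', 'value': 'https://example.com/repo'} would look like:
#
#   {
#     "size": 0,
#     "query": {"bool": {"filter": [{"term": {"origin": "https://example.com/repo"}}]}},
#     "aggs": {"1": {"max": {"field": "metadata__updated_on"}}}
#   }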
def get_rich_item(self, item):
    # We need to detect the category of the item: activities (report), events or users
    eitem = {}

    if 'time' not in item['data']:
        logger.warning("[meetup] Not processing %s: no time field", item['uuid'])
        return eitem

    for f in self.RAW_FIELDS_COPY:
        if f in item:
            eitem[f] = item[f]
        else:
            eitem[f] = None

    event = item['data']

    # data fields to copy
    copy_fields = ["id", "how_to_find_us"]
    for f in copy_fields:
        if f in event:
            eitem[f] = event[f]
        else:
            eitem[f] = None

    # Fields whose names are translated
    map_fields = {
        "link": "url",
        "rsvp_limit": "rsvps_limit"
    }
    for fn in map_fields:
        if fn in event:
            eitem[map_fields[fn]] = event[fn]
        else:
            eitem[map_fields[fn]] = None

    # event host fields: author of the event
    if 'event_hosts' in event:
        host = event['event_hosts'][0]
        if 'photo' in host:
            eitem['member_photo_url'] = host['photo']['photo_link']
            eitem['member_photo_id'] = host['photo']['id']
            eitem['member_photo_type'] = host['photo']['type']
        eitem['member_is_host'] = True
        eitem['member_id'] = host['id']
        eitem['member_name'] = host['name']
        eitem['member_url'] = "https://www.meetup.com/members/" + str(host['id'])

    eitem['event_url'] = event['link']

    # data fields to copy with the meetup_ prefix
    copy_fields = ["description", "plain_text_description",
                   "name", "status", "utc_offset", "visibility",
                   "waitlist_count", "yes_rsvp_count", "duration",
                   "featured", "rsvpable"]
    copy_fields_time = ["time", "updated", "created"]

    for f in copy_fields:
        if f in event:
            eitem["meetup_" + f] = event[f]
        else:
            eitem["meetup_" + f] = None

    for f in copy_fields_time:
        if f in event:
            eitem["meetup_" + f] = unixtime_to_datetime(event[f] / 1000).isoformat()
        else:
            eitem["meetup_" + f] = None

    rsvps = event.get('rsvps', [])

    eitem['num_rsvps'] = len(rsvps)
    eitem['num_comments'] = len(event['comments'])

    try:
        if 'time' in event:
            eitem['time_date'] = unixtime_to_datetime(event['time'] / 1000).isoformat()
        else:
            logger.warning("time field not found in event")
            return {}
    except ValueError:
        logger.warning("Wrong datetime for %s: %s", eitem['url'], event['time'])
        # Without a datetime for the enriched item, it is useless for Kibana
        return {}

    if 'venue' in event:
        venue = event['venue']
        copy_fields = ["id", "name", "city", "state", "zip", "country",
                       "localized_country_name", "repinned", "address_1"]
        for f in copy_fields:
            if f in venue:
                eitem["venue_" + f] = venue[f]
            else:
                eitem["venue_" + f] = None

        eitem['venue_geolocation'] = {
            "lat": event['venue']['lat'],
            "lon": event['venue']['lon'],
        }

    if 'series' in event:
        eitem['series_id'] = event['series']['id']
        eitem['series_description'] = event['series']['description']
        eitem['series_start_date'] = event['series']['start_date']

    if 'group' in event:
        group = event['group']
        copy_fields = ["id", "created", "join_mode", "name", "url_name", "who"]
        for f in copy_fields:
            if f in group:
                eitem["group_" + f] = group[f]
            else:
                eitem["group_" + f] = None

        eitem['group_geolocation'] = {
            "lat": group['lat'],
            "lon": group['lon'],
        }

        if eitem['group_created']:
            eitem['group_created'] = unixtime_to_datetime(eitem['group_created'] / 1000).isoformat()

        eitem['group_topics'] = []
        eitem['group_topics_keys'] = []
        if 'topics' in group:
            group_topics = [topic['name'] for topic in group['topics']]
            group_topics_keys = [topic['urlkey'] for topic in group['topics']]
            eitem['group_topics'] = group_topics
            eitem['group_topics_keys'] = group_topics_keys

    if len(rsvps) > 0:
        eitem['group_members'] = rsvps[0]['group']['members']

    created = unixtime_to_datetime(event['created'] / 1000).isoformat()
    eitem['type'] = "meetup"
    # time_date is when the meetup will take place, the date needed in this index;
    # created is when the meetup entry was created and is not the interesting date
    eitem.update(self.get_grimoire_fields(eitem['time_date'], eitem['type']))

    if self.sortinghat:
        eitem.update(self.get_item_sh(event))

    if self.prjs_map:
        eitem.update(self.get_item_project(eitem))

    self.add_repository_labels(eitem)
    self.add_metadata_filter_raw(eitem)

    return eitem
def get_rich_item(self, item, kind='question', question_tags=None):
    eitem = {}

    # Fields common to questions and answers
    common_fields = ["title", "comment_count", "question_id",
                     "delete_vote_count", "up_vote_count",
                     "down_vote_count", "favorite_count", "view_count",
                     "last_activity_date", "link", "score", "tags"]

    if kind == 'question':
        for f in self.RAW_FIELDS_COPY:
            if f in item:
                eitem[f] = item[f]
            else:
                eitem[f] = None

        # The real data
        question = item['data']

        eitem["item_id"] = question['question_id']
        eitem["type"] = 'question'

        eitem["author"] = None
        if 'owner' in question and question['owner']['user_type'] == "does_not_exist":
            logger.warning("question without owner: %s", question['question_id'])
        else:
            eitem["author"] = question['owner']['display_name']

        eitem["author_link"] = None
        if 'link' in question['owner']:
            eitem["author_link"] = question['owner']['link']
        eitem["author_reputation"] = None
        if 'reputation' in question['owner']:
            eitem["author_reputation"] = question['owner']['reputation']

        # data fields to copy
        copy_fields = common_fields + ['answer_count']
        for f in copy_fields:
            if f in question:
                eitem[f] = question[f]
            else:
                eitem[f] = None

        eitem["question_tags"] = question['tags']
        # eitem["question_tags_custom_analyzed"] = question['tags']

        # Fields whose names are translated
        map_fields = {"title": "question_title"}
        for fn in map_fields:
            eitem[map_fields[fn]] = question[fn]
        eitem['title_analyzed'] = question['title']

        eitem['question_has_accepted_answer'] = 0
        eitem['question_accepted_answer_id'] = None

        if question['answer_count'] >= 1 and 'answers' not in question:
            logger.warning("Missing answers for question %s", question['question_id'])
        elif question['answer_count'] >= 1 and 'answers' in question:
            answers_id = [p['answer_id'] for p in question['answers']
                          if 'is_accepted' in p and p['is_accepted']]
            eitem['question_accepted_answer_id'] = answers_id[0] if answers_id else None
            eitem['question_has_accepted_answer'] = 1 if eitem['question_accepted_answer_id'] else 0

        creation_date = unixtime_to_datetime(question["creation_date"]).isoformat()
        eitem['creation_date'] = creation_date
        eitem.update(self.get_grimoire_fields(creation_date, "question"))

        if self.sortinghat:
            eitem.update(self.get_item_sh(item))

        if self.prjs_map:
            eitem.update(self.get_item_project(eitem))

        self.add_repository_labels(eitem)
        self.add_metadata_filter_raw(eitem)

    elif kind == 'answer':
        answer = item

        eitem["type"] = 'answer'
        eitem["author"] = answer['owner']['display_name']
        eitem["author_link"] = None
        eitem["item_id"] = answer['answer_id']
        if 'link' in answer['owner']:
            eitem["author_link"] = answer['owner']['link']
        if 'reputation' in answer['owner']:
            eitem["author_reputation"] = answer['owner']['reputation']

        # data fields to copy
        copy_fields = common_fields + ["origin", "tag", "creation_date",
                                       "is_accepted", "answer_id"]
        for f in copy_fields:
            if f in answer:
                eitem[f] = answer[f]
            else:
                eitem[f] = None

        eitem['is_accepted_answer'] = 1 if answer['is_accepted'] else 0
        eitem['answer_status'] = "accepted" if answer['is_accepted'] else "not_accepted"
        eitem["question_tags"] = question_tags
        if 'tags' in answer:
            eitem["answer_tags"] = answer['tags']

        # Fields whose names are translated
        map_fields = {"title": "question_title"}
        for fn in map_fields:
            eitem[map_fields[fn]] = answer[fn]

        creation_date = unixtime_to_datetime(answer["creation_date"]).isoformat()
        eitem['creation_date'] = creation_date
        eitem.update(self.get_grimoire_fields(creation_date, "answer"))

        if self.sortinghat:
            # The date field must be the same as in the question to share code
            answer[self.get_field_date()] = eitem['creation_date']
            eitem[self.get_field_date()] = eitem['creation_date']
            eitem.update(self.get_item_sh(answer))

        if self.prjs_map:
            eitem.update(self.get_item_project(eitem))

    return eitem
def get_rich_item(self, item):
    eitem = {}

    for f in self.RAW_FIELDS_COPY:
        if f in item:
            eitem[f] = item[f]
        else:
            eitem[f] = None

    question = item['data']

    if 'accepted_answer_id' not in question:
        question['accepted_answer_id'] = None

    # Fields that are the same in item and eitem
    copy_fields = ["id", "url", "title", "summary", "score"]
    for f in copy_fields:
        if f in question:
            eitem[f] = question[f]
        else:
            eitem[f] = None

    # Fields whose names are translated
    map_fields = {
        "title": "question_title",
        "answer_count": "question_answer_count",
        "view_count": "question_view_count",
        "answer_ids": "question_answer_ids"
    }
    for fn in map_fields:
        if fn in question:
            eitem[map_fields[fn]] = question[fn]
        else:
            eitem[map_fields[fn]] = None

    # Cast the id of the question to string
    eitem['id'] = str(eitem['id'])
    eitem['score'] = int(eitem['score']) if eitem['score'] else 0

    # First answer time
    added_at = unixtime_to_datetime(float(question["added_at"]))
    eitem['time_to_reply'] = None
    if 'answers' in question:
        # answers are ordered by time
        first_answer_time = unixtime_to_datetime(float(question['answers'][0]["added_at"]))
        eitem['time_to_reply'] = get_time_diff_days(added_at, first_answer_time)
        eitem['question_has_accepted_answer'] = 1 if question['accepted_answer_id'] else 0
        eitem['question_accepted_answer_id'] = question['accepted_answer_id']
    else:
        eitem['question_has_accepted_answer'] = 0

    if question['author'] and type(question['author']) is dict:
        eitem['author_askbot_user_name'] = question['author']['username']
        eitem['author_askbot_id'] = str(question['author']['id'])
        eitem['author_badges'] = question['author']['badges']
        eitem['author_reputation'] = int(question['author']['reputation'])
        eitem['author_url'] = eitem['origin'] + '/users/'
        eitem['author_url'] += question['author']['id'] + '/' + question['author']['username']

    eitem['question_last_activity_at'] = unixtime_to_datetime(
        float(question['last_activity_at'])).isoformat()
    eitem['question_last_activity_by_id'] = question['last_activity_by']['id']
    eitem['question_last_activity_by_username'] = question['last_activity_by']['username']

    # A list can be used directly to filter in Kibana
    eitem['question_tags'] = question['tags']
    eitem['question_answer_ids'] = question['answer_ids']

    eitem['comment_count'] = 0
    if 'answers' in question:
        eitem['comment_count'] = sum([len(a['comments']) if 'comments' in a else 0
                                      for a in question['answers']])

    if self.sortinghat:
        eitem.update(self.get_item_sh(item))

    if self.prjs_map:
        eitem.update(self.get_item_project(eitem))

    eitem["type"] = "question"
    eitem.update(self.get_grimoire_fields(added_at.isoformat(), eitem["type"]))

    self.add_repository_labels(eitem)
    self.add_metadata_filter_raw(eitem)

    return eitem