def collect_cards(requester, db_board, board_name, board_members, all_lists):
    trello_client = init_trello_client(requester)
    # make an instance of py-trello's Board object to have access to relevant api calls
    board = Board(client=trello_client, board_id=db_board.trello_board_id)
    board.name = board_name
    # load all checklists
    checklists = defaultdict(list)
    for cl in board.get_checklists():
        checklists[cl.card_id].append(cl)

    open_cards = collect_cards_internal(requester, board, board_members, checklists, all_lists, card_status='open')
    # request closed cards separately to have a better chance to index all open cards
    # (thus avoiding hitting rate limits already in open cards indexing)
    collect_cards_internal(requester, board, board_members, checklists, all_lists, card_status='closed')

    # update board lists with a list of cards
    lists_with_cards = defaultdict(list)
    for ac in open_cards:
        lists_with_cards[ac.idList].append({
            'id': ac.id,
            'name': ac.name,
            'pos': ac.pos,
            'url': ac.url
        })
    board_lists = db_board.trello_content.get('lists', [])
    for bl in board_lists:
        bl.update({
            'cards': sorted(lists_with_cards[bl['id']], key=itemgetter('pos'))
        })
    db_board.trello_content = {
        'description': db_board.trello_content.get('description'),
        'lists': board_lists
    }
    db_board.save()
    algolia_engine.sync(db_board, add=False)

def _process_customer(requester, customer, mailboxes, folders, users):
    if customer.id is None or (customer.emails is None and customer.fullname is None):
        # can't use a customer with no data
        logger.debug("Customer '%s' for user '%s' cannot be used - no data",
                     (customer.id or customer.fullname), requester.username)
        return

    db_customer, created = Document.objects.get_or_create(
        helpscout_customer_id=customer.id,
        requester=requester,
        user_id=requester.id)
    db_customer.helpscout_name = customer.fullname
    logger.debug("Processing Helpscout customer '%s' for user '%s'",
                 customer.fullname, requester.username)

    new_updated = customer.modifiedat
    new_updated_ts = parse_dt(new_updated).timestamp()
    if not created and db_customer.last_updated_ts:
        new_updated_ts = db_customer.last_updated_ts \
            if db_customer.last_updated_ts > new_updated_ts else new_updated_ts
    db_customer.last_updated = datetime.utcfromtimestamp(new_updated_ts).isoformat() + 'Z'
    db_customer.last_updated_ts = new_updated_ts
    db_customer.helpscout_title = 'User: {}'.format(customer.fullname)
    db_customer.webview_link = 'https://secure.helpscout.net/customer/{}/0/'.format(customer.id)
    db_customer.primary_keywords = HELPSCOUT_KEYWORDS['primary']
    db_customer.secondary_keywords = HELPSCOUT_KEYWORDS['secondary']
    db_customer.helpscout_company = customer.organization
    db_customer.helpscout_emails = ', '.join(
        e.get('value') for e in customer.emails if 'value' in e) if customer.emails else None
    db_customer.save()
    algolia_engine.sync(db_customer, add=created)

    subtask(process_customer).delay(requester, db_customer, mailboxes, folders, users)

def collect_cards_internal(requester, board, board_members, checklists, lists, card_status):
    collected_cards = []
    last_card_id = None
    while True:
        filters = {'filter': 'all', 'fields': 'all', 'limit': '1000'}
        if last_card_id:
            # Trello api supports paging by using the id of the last card in the previous batch
            # as the 'before' parameter
            filters['before'] = last_card_id
        cards = board.get_cards(filters=filters, card_filter=card_status)
        for card in cards:
            db_card, created = Document.objects.get_or_create(
                trello_board_id=board.id,
                trello_card_id=card.id,
                requester=requester,
                user_id=requester.id
            )
            card_last_activity = card.raw.get('dateLastActivity')
            last_activity = parse_dt(card_last_activity).isoformat()
            last_activity_ts = int(parse_dt(card_last_activity).timestamp())
            collected_cards.append(card)
            if not created and db_card.last_updated_ts and db_card.last_updated_ts >= last_activity_ts:
                logger.debug("Trello card '%s' for user '%s' hasn't changed", card.name[:50], requester.username)
                continue
            logger.debug("Processing card '%s' for user '%s'", card.name[:50], requester.username)
            db_card.primary_keywords = TRELLO_PRIMARY_KEYWORDS
            db_card.secondary_keywords = TRELLO_SECONDARY_KEYWORDS['card']
            db_card.last_updated = last_activity
            db_card.last_updated_ts = last_activity_ts
            db_card.trello_title = 'Card: {}'.format(card.name)
            db_card.webview_link = card.url
            db_card.trello_content = {
                'description': _to_html(card.description),
                'checklists': [
                    {
                        'id': cl.id,
                        'name': cl.name,
                        'items': cl.items
                    }
                    for cl in checklists[card.id]
                ]
            }
            db_card.trello_card_status = 'Archived' if card.closed else 'Open'
            db_card.trello_card_members = [board_members.get(m) for m in card.idMembers if m in board_members]
            db_card.trello_board_name = board.name
            db_card.trello_list = lists.get(card.idList)
            db_card.last_synced = get_utc_timestamp()
            db_card.download_status = Document.READY
            db_card.save()
            algolia_engine.sync(db_card, add=created)
            last_card_id = card.id
        if len(cards) < 1000:
            break
    return collected_cards

def collect_files(requester, repo_id, repo_name, repo_url, default_branch, enrichment_delay):
    """
    List all files in a repo - should be called once, after the first sync of a repo.
    Subsequent syncing is handled via the collect_commits() function.

    Note that this uses Github's API call for retrieval of recursive trees:
    https://developer.github.com/v3/git/trees/#get-a-tree-recursively
    This API call returns a flat list of all files and saves us many API calls that would be
    needed to recursively fetch files for each repo directory. But it may not work well for
    very big repos (> 5k files), because the Github API has a limit on the number of elements
    it will return in one call.
    """
    github_client = init_github_client(requester)
    repo = github_client.get_repo(full_name_or_id=repo_name)
    new_files = []
    for f in repo.get_git_tree(sha=repo.default_branch, recursive=True).tree:
        db_file, created = Document.objects.get_or_create(
            github_file_id=_compute_sha('{}{}'.format(repo_id, f.path)),
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if created:
            new_files.append({
                'sha': f.sha,
                'filename': f.path,
                'action': 'modified',
                'type': f.type
            })
        db_file.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_file.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['file']
        # set the timestamp to 0 (epoch) to signal that we don't know the update timestamp
        db_file.last_updated_ts = 0
        db_file.last_updated = datetime.utcfromtimestamp(0).isoformat() + 'Z'
        db_file.github_title = '{}: {}'.format(
            'Dir' if f.type == 'tree' else 'File', f.path.split('/')[-1])
        db_file.github_file_path = f.path
        db_file.github_repo_full_name = repo_name
        db_file.webview_link = '{}/blob/{}/{}'.format(repo_url, default_branch, f.path)
        algolia_engine.sync(db_file, add=created)
        db_file.last_synced = get_utc_timestamp()
        db_file.download_status = Document.PENDING
        db_file.save()

    # run enrich_files() for all new_files in chunks of 50 items
    i = 0
    for ff in [new_files[x:x + 50] for x in range(0, len(new_files), 50)]:
        i = i + 1
        subtask(enrich_files).apply_async(
            args=[requester, ff, repo.id, repo_name, repo_url, default_branch],
            countdown=enrichment_delay + (240 * i))

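# _compute_sha is not defined in this section; it is used above (and again in enrich_files)
# to derive a stable key for a file from '<repo_id><file_path>'. A minimal sketch of the
# assumed behavior - any deterministic hex digest would do, SHA-1 is just an illustration:
import hashlib


def _compute_sha(value):
    """Illustrative only - return a hex digest used as a stable identifier for a string key."""
    return hashlib.sha1(value.encode('utf-8')).hexdigest()
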
def collect_deals(requester):
    pipe_client = init_pipedrive_client(requester)
    stages = {s.id: s.name for s in pipe_client.Stage.fetch_all()}
    users = {u.id: u for u in pipe_client.User.fetch_all()}
    # fallback domain
    org_domain = None
    for deal in pipe_client.Deal.fetch_all():
        if deal.org_id:
            org_domain = deal.org_id.get('cc_email', '').split('@')[0]
        if not org_domain:
            # cannot associate a deal to a company
            logger.debug("Deal '%s' for user '%s' cannot be associated to a company",
                         deal.title, requester.username)
            continue
        db_deal, created = Document.objects.get_or_create(
            pipedrive_deal_id=deal.id,
            requester=requester,
            user_id=requester.id
        )
        if not created and db_deal.last_updated_ts:
            # compare timestamps and skip the deal if it hasn't been updated
            if db_deal.last_updated_ts >= parse_dt(deal.update_time).timestamp():
                logger.debug("Deal '%s' for user '%s' hasn't changed", deal.title, requester.username)
                continue
        db_deal.primary_keywords = PIPEDRIVE_KEYWORDS['primary']
        db_deal.secondary_keywords = PIPEDRIVE_KEYWORDS['secondary']
        db_deal.pipedrive_title = deal.title
        logger.debug("Processing deal '%s' for user '%s'", deal.title, requester.username)
        db_deal.pipedrive_deal_company = deal.org_id.get('name') if deal.org_id else None
        db_deal.pipedrive_deal_value = deal.value
        db_deal.pipedrive_deal_currency = deal.currency
        db_deal.pipedrive_deal_status = deal.status
        db_deal.pipedrive_deal_stage = stages.get(deal.stage_id)
        db_deal.webview_link = 'https://{}.pipedrive.com/deal/{}'.format(org_domain, deal.id)
        db_deal.last_updated = parse_dt(deal.update_time).isoformat() + 'Z'
        db_deal.last_updated_ts = parse_dt(deal.update_time).timestamp()
        db_deal.pipedrive_content = build_deal_content(deal, users, org_domain, pipe_client)
        db_deal.last_synced = get_utc_timestamp()
        db_deal.download_status = Document.READY
        db_deal.save()
        algolia_engine.sync(db_deal, add=created)
        # add sleep of one second to avoid breaking API rate limits
        time.sleep(1)

def download_gdrive_document(doc, access_token, refresh_token):
    doc.download_status = Document.PROCESSING
    doc.save()
    try:
        service = connect_to_gdrive(access_token, refresh_token)
        request = None
        if doc.mime_type.startswith('application/vnd.google-apps.'):
            export_mime = 'text/csv' if 'spreadsheet' in doc.mime_type else 'text/plain'
            request = service.files().export_media(fileId=doc.document_id, mimeType=export_mime)
        else:
            request = service.files().get_media(fileId=doc.document_id)
        response = request.execute()
        logger.info("Done downloading {} [{}]".format(doc.title, doc.document_id))
        content = cut_utf_string(response.decode('UTF-8', errors='replace'), 9000, step=10)
        doc.content = content
        doc.last_synced = get_utc_timestamp()
        algolia_engine.sync(doc, add=False)
    finally:
        doc.download_status = Document.READY
        doc.save()

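# cut_utf_string is defined elsewhere; the sketch below shows the behavior the callers in this
# section assume: trim a string so its UTF-8 encoding fits within a byte budget (Algolia records
# must stay well under 10 KB), dropping 'step' characters at a time rather than one by one.
def cut_utf_string(s, byte_limit, step=10):
    """Illustrative only - shorten 's' until its UTF-8 form fits in 'byte_limit' bytes."""
    while len(s.encode('UTF-8')) > byte_limit:
        s = s[:-step]
    return s
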
def process_gdrive_docs(requester, access_token, refresh_token, files_fn, json_key):
    service = connect_to_gdrive(access_token, refresh_token)
    folders = {}
    page_token = None
    new_start_page_token = None
    while True:
        files = files_fn(service, page_token)
        new_start_page_token = files.get('newStartPageToken', new_start_page_token)
        items = files.get(json_key, [])
        if not folders and len(items) > 0:
            # retrieve all folders to be able to get file path more easily in the file listing(s)
            logger.debug("Getting folders for %s/%s", requester.id, requester.username)
            folders = get_gdrive_folders(service)
            # check if any folder was marked as hidden and we already have it synced ...
            # if we do, then remove it (plus all children) from our indexing
            for folder_id, folder in folders.items():
                if folder.get('hidden') is True:
                    desync_folder(folder.get('id'), folders, requester, service)
        for item in items:
            if 'file' in item:
                item = item['file']
            # check for ignored mime types
            if any(x.match(item.get('mimeType', '')) for x in IGNORED_MIMES):
                continue
            parents = item.get('parents', [])
            hidden = is_hidden(item.get('description')) or any(is_hidden_in_folder(f, folders) for f in parents)
            if item.get('trashed') or hidden:
                # file was removed or hidden
                Document.objects.filter(
                    document_id=item['id'],
                    requester=requester,
                    user_id=requester.id
                ).delete()
                continue
            # handle file path within gdrive
            parent = parents[0] if parents else None
            path = get_gdrive_path(parent, folders)
            doc, created = get_or_create(
                model=Document,
                document_id=item['id'],
                requester=requester,
                user_id=requester.id
            )
            doc.mime_type = item.get('mimeType').lower()
            doc.title = item.get('name')
            doc.webview_link = item.get('webViewLink')
            doc.icon_link = item.get('iconLink')
            doc.thumbnail_link = item.get('thumbnailLink')
            doc.last_updated = item.get('modifiedTime')
            doc.path = path
            last_modified_on_server = parse_date(doc.last_updated)
            doc.last_updated_ts = last_modified_on_server.timestamp()
            doc.modifier_display_name = item.get('lastModifyingUser', {}).get('displayName')
            doc.modifier_photo_link = item.get('lastModifyingUser', {}).get('photoLink')
            doc.owner_display_name = item['owners'][0]['displayName']
            doc.owner_photo_link = item.get('owners', [{}])[0].get('photoLink')
            doc.primary_keywords = GDRIVE_KEYWORDS['primary']
            doc.secondary_keywords = GDRIVE_KEYWORDS['secondary'][doc.mime_type] \
                if doc.mime_type in GDRIVE_KEYWORDS['secondary'] else None

            can_download = item.get('capabilities', {}).get('canDownload', True)
            if can_download:
                # check also the mime type as we only support some of them
                if not any(x for x in EXPORTABLE_MIMES if doc.mime_type.startswith(x)):
                    can_download = False
            if can_download:
                if not created:
                    if doc.download_status is Document.READY and can_download and \
                            (doc.last_synced is None or last_modified_on_server > doc.last_synced):
                        doc.download_status = Document.PENDING
                        subtask(download_gdrive_document).delay(doc, access_token, refresh_token)
                else:
                    algolia_engine.sync(doc, add=created)
                    subtask(download_gdrive_document).delay(doc, access_token, refresh_token)
            else:
                doc.download_status = Document.READY
                doc.last_synced = get_utc_timestamp()
                doc.save()
                algolia_engine.sync(doc, add=False)
            doc.save()
        page_token = files.get('nextPageToken')
        if not page_token:
            break
    return new_start_page_token

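# get_or_create(model=..., **kwargs) above is a helper defined elsewhere (it is also used in
# collect_commits below). Judging by its usage, it is assumed to be a thin wrapper around
# Django's manager method that lets the model be passed as an argument - a minimal sketch:
def get_or_create(model, **kwargs):
    """Illustrative only - return (instance, created) for the given model and lookup kwargs."""
    return model.objects.get_or_create(**kwargs)
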
def collect_issues(requester, sync_update=False):
    jira = init_jira_client(requester)
    for project in jira.projects():
        project_name = project.raw.get('name')
        project_key = project.raw.get('key')
        project_url = '{}/projects/{}'.format(project._options.get('server'), project_key)
        logger.debug("Processing Jira project %s for user %s", project_key, requester.username)
        jql = 'project={}'.format(project_key)
        if sync_update:
            # only fetch those issues that were updated in the last day
            jql = "{} and updated > '-1d'".format(jql)
        jql = '{} order by key'.format(jql)

        i = 0
        old_i = -1
        while True:
            # manually page through results (using 'maxResults=None' should page automatically,
            # but it doesn't work)
            if i == old_i:
                break
            old_i = i
            for issue in jira.search_issues(jql, startAt=i, maxResults=25, validate_query=False):
                i = i + 1
                db_issue, created = Document.objects.get_or_create(
                    jira_issue_key=issue.key,
                    requester=requester,
                    user_id=requester.id)
                logger.debug("Processing Jira issue %s for user %s", issue.key, requester.username)
                updated = issue.fields.updated or issue.fields.created or get_utc_timestamp()
                updated_ts = parse_dt(updated).timestamp()
                if not created and db_issue.last_updated_ts:
                    # compare timestamps and skip the issue if it hasn't been updated
                    if db_issue.last_updated_ts >= updated_ts:
                        logger.debug("Issue '%s' for user '%s' hasn't changed", issue.key, requester.username)
                        continue
                db_issue.primary_keywords = JIRA_KEYWORDS['primary']
                db_issue.secondary_keywords = JIRA_KEYWORDS['secondary']
                db_issue.last_updated = updated
                db_issue.last_updated_ts = updated_ts
                db_issue.webview_link = '{}/browse/{}'.format(project._options.get('server'), issue.key)
                db_issue.jira_issue_title = '{}: {}'.format(issue.key, issue.fields.summary)
                db_issue.jira_issue_status = issue.fields.status.name
                db_issue.jira_issue_type = issue.fields.issuetype.name
                db_issue.jira_issue_priority = issue.fields.priority.name
                if issue.fields.description:
                    db_issue.jira_issue_description = cut_utf_string(issue.fields.description, 9000, 100)
                db_issue.jira_issue_duedate = issue.fields.duedate
                db_issue.jira_issue_labels = issue.fields.labels
                db_issue.jira_issue_assignee = {
                    'name': issue.fields.assignee.displayName,
                    'avatar': issue.fields.assignee.raw.get('avatarUrls', {})
                } if issue.fields.assignee else {}
                reporter = issue.fields.reporter or issue.fields.creator
                db_issue.jira_issue_reporter = {
                    'name': reporter.displayName,
                    'avatar': reporter.raw.get('avatarUrls', {})
                }
                db_issue.jira_project_name = project_name
                db_issue.jira_project_key = project_key
                db_issue.jira_project_link = project_url
                db_issue.last_synced = get_utc_timestamp()
                db_issue.download_status = Document.READY
                db_issue.save()
                algolia_engine.sync(db_issue, add=created)
            # short pause between result pages
            time.sleep(2)
        # add sleep of five seconds to avoid breaking API rate limits
        time.sleep(5)

def process_customer(requester, db_customer, mailboxes, folders, users):
    helpscout_client = init_helpscout_client(requester)
    db_customer.download_status = Document.PROCESSING
    db_customer.save()

    last_conversation = {}
    conversation_emails = set()
    conversations = []
    for box_id, box_name in mailboxes.items():
        logger.debug("Fetching Helpscout conversations for '%s' in mailbox '%s'",
                     db_customer.helpscout_name, box_name)
        while True:
            box_conversations = helpscout_client.conversations_for_customer_by_mailbox(
                box_id, db_customer.helpscout_customer_id)
            if not box_conversations or box_conversations.count < 1:
                break
            for bc in box_conversations:
                conversation = {
                    'id': bc.id,
                    'number': '#{}'.format(bc.number),
                    'mailbox': box_name,
                    'mailbox_id': box_id,
                    'folder': folders.get(bc.folderid),
                    'status': bc.status,
                    'owner': format_person(bc.owner),
                    'customer': format_person(bc.customer),
                    'subject': bc.subject,
                    'tags': bc.tags
                }
                last_updated = next(
                    (getattr(bc, x) for x in ['usermodifiedat', 'modifiedat', 'createdat'] if hasattr(bc, x)),
                    None)
                conversation['last_updated'] = last_updated
                if last_updated:
                    conversation['last_updated_ts'] = parse_dt(last_updated).timestamp()
                conversations.append(conversation)
                if bc.customer:
                    conversation_emails = conversation_emails.union(bc.customer.get('emails') or [])
                if last_updated and \
                        conversation.get('last_updated_ts', 0) > last_conversation.get('last_updated_ts', 0):
                    last_conversation = conversation
            # add sleep of three seconds to avoid breaking API rate limits
            time.sleep(3)
        helpscout_client.clearstate()

    if db_customer.last_updated_ts >= last_conversation.get('last_updated_ts', 0):
        logger.info("Helpscout customer '%s' for user '%s' seems unchanged, skipping further processing",
                    db_customer.helpscout_name, requester.username)
        db_customer.download_status = Document.READY
        db_customer.save()
        return

    db_customer.last_updated = last_conversation.get('last_updated')
    db_customer.last_updated_ts = last_conversation.get('last_updated_ts')
    db_customer.helpscout_mailbox = last_conversation.get('mailbox')
    db_customer.helpscout_mailbox_id = last_conversation.get('mailbox_id')
    db_customer.helpscout_folder = last_conversation.get('folder')
    db_customer.helpscout_status = last_conversation.get('status')
    db_customer.helpscout_assigned = last_conversation.get('owner') is not None
    if conversation_emails:
        db_customer.helpscout_emails = ', '.join(filter(None, conversation_emails))

    # build helpscout content
    content = process_conversations(users, conversations, helpscout_client)
    db_customer.helpscout_content = content
    db_customer.download_status = Document.READY
    db_customer.last_synced = get_utc_timestamp()
    db_customer.save()
    algolia_engine.sync(db_customer, add=False)

def collect_boards(requester):
    trello_client = init_trello_client(requester)
    orgs = dict()
    for board in trello_client.list_boards(board_filter='open,closed'):
        db_board, created = Document.objects.get_or_create(
            trello_board_id=board.id,
            trello_card_id__isnull=True,
            requester=requester,
            user_id=requester.id
        )
        board_last_activity = board.raw.get('dateLastActivity')
        if not board_last_activity:
            # this nasty hack is needed, because some Trello boards don't have a 'dateLastActivity' timestamp
            # -> looks like it's those boards that have been inactive for some time
            if not created:
                board_last_activity = db_board.last_updated.isoformat()
            else:
                # Trello was established in 2011, so we use 01.01.2011 as the epoch
                actions = board.fetch_actions(action_filter='all', action_limit=1,
                                              since='2011-01-01T00:00:00.000Z')
                if actions:
                    board_last_activity = actions[0].get('date')
        last_activity = parse_dt(board_last_activity).isoformat()
        last_activity_ts = int(parse_dt(board_last_activity).timestamp())
        if not created and db_board.download_status == Document.READY and \
                (db_board.last_updated_ts and db_board.last_updated_ts >= last_activity_ts):
            logger.debug("Trello board '%s' for user '%s' hasn't changed", board.name[:50], requester.username)
            continue
        logger.debug("Processing board '%s' for user '%s'", board.name[:50], requester.username)
        db_board.primary_keywords = TRELLO_PRIMARY_KEYWORDS
        db_board.secondary_keywords = TRELLO_SECONDARY_KEYWORDS['board']
        db_board.last_updated = last_activity
        db_board.last_updated_ts = last_activity_ts
        db_board.trello_title = 'Board: {}'.format(board.name)
        db_board.webview_link = board.url
        db_board._trello_description = board.description
        db_board.trello_board_status = 'Closed' if board.closed else 'Open'

        orgId = board.raw.get('idOrganization')
        if orgId and orgId not in orgs:
            try:
                org = trello_client.get_organization(orgId).raw
                orgs[orgId] = {
                    'name': org.get('displayName'),
                    'logo': 'https://trello-logos.s3.amazonaws.com/{}/30.png'.format(orgId),
                    'url': org.get('url')
                }
            except ResourceUnavailable:
                # defunct/deleted organization, assume that the board is personal
                orgId = None
        db_board.trello_board_org = orgs[orgId] if orgId else None

        build_list = lambda l: {
            'id': l.id,
            'name': l.name,
            'closed': l.closed,
            'pos': l.pos
        }
        all_lists = {l.id: build_list(l) for l in board.all_lists()}
        db_board.trello_content = {
            'description': _to_html(board.description),
            'lists': sorted(
                filter(lambda x: not x.get('closed'), all_lists.values()),
                key=itemgetter('pos')
            )
        }

        build_member = lambda m: {
            'name': m.full_name,
            'url': m.url,
            'avatar': 'https://trello-avatars.s3.amazonaws.com/{}/30.png'.format(m.avatar_hash)
        }
        all_members = {m.id: build_member(m) for m in board.all_members()}
        db_board.trello_board_members = list(all_members.values())

        db_board.last_synced = get_utc_timestamp()
        db_board.download_status = Document.READY
        db_board.save()
        algolia_engine.sync(db_board, add=created)

        subtask(collect_cards).delay(requester, db_board, board.name, all_members, all_lists)
        # add sleep of 30s to avoid breaking api limits
        time.sleep(30)

def collect_repos(requester):
    github_client = init_github_client(requester)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        logger.debug("Skipping github repos sync for user '%s' due to rate limits", requester.username)
        return

    i = 0
    for repo in github_client.get_user().get_repos():
        if not (repo.id or repo.full_name):
            logger.debug("Skipping github repo '%s' for user '%s'", repo.full_name, requester.username)
            # seems like broken data, skip it
            continue
        if repo.fork:
            # don't process forked repos
            logger.debug("Skipping forked github repo '%s' for user '%s'", repo.full_name, requester.username)
            continue
        db_repo, created = Document.objects.get_or_create(
            github_repo_id=repo.id,
            github_commit_id__isnull=True,
            github_file_id__isnull=True,
            github_issue_id__isnull=True,
            requester=requester,
            user_id=requester.id)
        db_repo.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_repo.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['repo']
        db_repo.github_title = 'Repo: {}'.format(repo.name)
        db_repo.github_repo_owner = repo.owner.login
        db_repo.github_repo_description = repo.description
        logger.debug("Processing github repo '%s' for user '%s'", repo.full_name, requester.username)

        commit_count = 0
        contributors = []
        try:
            # fetch contributors
            for cnt in repo.get_contributors():
                commit_count = commit_count + cnt.contributions
                if len(contributors) <= 10:
                    contributors.append({
                        'name': cnt.name,
                        'url': cnt.html_url,
                        'avatar': cnt.avatar_url
                    })
        except UnknownObjectException:
            # most probably, this repo is disabled
            if created:
                logger.debug("Removing github repo '%s' for user '%s'", repo.full_name, requester.username)
                db_repo.delete()
            continue
        db_repo.github_repo_commit_count = commit_count
        db_repo.github_repo_contributors = contributors
        db_repo.github_repo_full_name = repo.full_name

        new_timestamp = max(repo.updated_at, repo.pushed_at)
        if created or new_timestamp.timestamp() > (db_repo.last_updated_ts or 0):
            i = i + 1
            db_repo.last_updated_ts = new_timestamp.timestamp()
            db_repo.last_updated = new_timestamp.isoformat() + 'Z'
            db_repo.webview_link = repo.html_url
            # fetch readme file
            try:
                readme = repo.get_readme()
                readme_content = cut_utf_string(
                    readme.decoded_content.decode('UTF-8', errors='replace'), 9000, step=100)
                md = github_client.render_markdown(text=readme_content).decode('UTF-8', errors='replace')
                # also replace <em> tags, because they are used by Algolia highlighting
                db_repo.github_repo_content = md.replace('<em>', '<b>').replace('</em>', '</b>')
                db_repo.github_repo_readme = readme.name
            except UnknownObjectException:
                # readme does not exist
                db_repo.github_repo_content = None
            algolia_engine.sync(db_repo, add=created)
            if created:
                # sync files
                subtask(collect_files).delay(requester, repo.id, repo.full_name, repo.html_url,
                                             repo.default_branch, enrichment_delay=i * 300)
            # sync commits
            subtask(collect_commits).apply_async(
                args=[requester, repo.id, repo.full_name, repo.html_url, repo.default_branch, commit_count],
                countdown=240 * i if created else 1)
            # sync issues
            subtask(collect_issues).apply_async(
                args=[requester, repo.id, repo.full_name, created],
                countdown=180 * i if created else 1)

        db_repo.last_synced = get_utc_timestamp()
        db_repo.download_status = Document.READY
        db_repo.save()

def collect_commits(requester, repo_id, repo_name, repo_url, default_branch, commit_count):
    """
    Sync repository commits - up to the last commit that we've already synced or max 200 recent
    commits (whichever comes first). This is possible because the Github API returns commits
    sorted by commit timestamp, and old commits don't change (at least they shouldn't in a
    normally run repository).
    """
    max_commits = 200
    was_synced = Document.objects.filter(
        user_id=requester.id,
        github_repo_id=repo_id,
        github_commit_id__isnull=False).count() >= min(commit_count, max_commits)
    github_client = init_github_client(requester, per_page=20 if was_synced else 100)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        logger.debug("Skipping github commits sync for user '%s' due to rate limits", requester.username)
        return

    i = 0
    for cmt in github_client.get_repo(full_name_or_id=repo_name).get_commits():
        if i >= max_commits:
            break
        i = i + 1
        db_commit, created = get_or_create(
            model=Document,
            github_commit_id=cmt.sha,
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if not created and was_synced:
            logger.debug(
                "Found already synced commit, skipping further commits syncing for user '%s' and repo '%s'",
                requester.username, repo_name)
            break
        logger.debug("Processing github commit for user '%s' and repo '%s' with message: %s",
                     requester.username, repo_name, cmt.commit.message[:30])
        db_commit.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_commit.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['commit']
        db_commit.last_updated_ts = cmt.commit.committer.date.timestamp()
        db_commit.last_updated = cmt.commit.committer.date.isoformat() + 'Z'
        db_commit.webview_link = cmt.html_url
        db_commit.github_title = 'Commit: {}'.format(cmt.commit.message[:50])
        db_commit.github_commit_content = cmt.commit.message
        db_commit.github_repo_full_name = repo_name
        db_commit.github_commit_committer = {
            'name': cmt.commit.author.name,
        }
        if cmt.author:
            db_commit.github_commit_committer['url'] = cmt.author.html_url
            db_commit.github_commit_committer['avatar'] = cmt.author.avatar_url

        # get the changed/added/deleted files in this commit (up to 100 files)
        files = []
        for f in cmt.files:
            files.append({
                'sha': f.sha,
                'filename': f.filename,
                'url': f.blob_url,
                'additions': f.additions,
                'deletions': f.deletions,
                'action': f.status
            })
            if len(files) >= 100:
                break
        if was_synced and len(files) > 0:
            subtask(enrich_files).delay(requester, files, repo_id, repo_name, repo_url, default_branch)
        db_commit.github_commit_files = files

        algolia_engine.sync(db_commit, add=created)
        db_commit.last_synced = get_utc_timestamp()
        db_commit.download_status = Document.READY
        db_commit.save()
        # add sleep of half a second to avoid breaking API rate limits
        time.sleep(0.5)

def enrich_files(requester, files, repo_id, repo_name, repo_url, default_branch):
    """ Fetch committers, update timestamp, etc. for files. """
    github_client = init_github_client(requester, per_page=50)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        # reschedule after 10 minutes
        logger.debug("Skipping github enrich files for user '%s' due to rate limits", requester.username)
        subtask(enrich_files).apply_async(
            args=[requester, files, repo_id, repo_name, repo_url, default_branch],
            countdown=600)
        return

    repo = github_client.get_repo(full_name_or_id=repo_name)
    for f in files:
        db_file, created = Document.objects.get_or_create(
            github_file_id=_compute_sha('{}{}'.format(repo_id, f.get('filename'))),
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if f.get('action') == 'removed':
            db_file.delete()
            continue
        logger.debug("Enriching github file '%s' for repo '%s' and user '%s'",
                     f.get('filename'), repo_name, requester.username)
        db_file.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_file.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['file']
        db_file.github_title = '{}: {}'.format(
            'Dir' if f.get('type') == 'tree' else 'File', f.get('filename').split('/')[-1])
        db_file.github_file_path = f.get('filename')
        db_file.github_repo_full_name = repo_name
        db_file.webview_link = '{}/blob/{}/{}'.format(repo_url, default_branch, f.get('filename'))

        committers = []
        seen = set()
        ts_set = False
        for cmt in repo.get_commits(sha=default_branch, path=f.get('filename')):
            if not ts_set:
                db_file.last_updated_ts = cmt.commit.committer.date.timestamp()
                db_file.last_updated = cmt.commit.committer.date.isoformat() + 'Z'
                ts_set = True
            if cmt.commit.committer.name not in seen:
                c = {'name': cmt.commit.committer.name}
                if cmt.committer:
                    c['url'] = cmt.committer.html_url
                    c['avatar'] = cmt.committer.avatar_url
                committers.append(c)
                seen.add(cmt.commit.committer.name)
            if len(committers) >= 10:
                break
        db_file.github_file_committers = committers

        algolia_engine.sync(db_file, add=created)
        db_file.last_synced = get_utc_timestamp()
        db_file.download_status = Document.READY
        db_file.save()
        # add sleep to avoid breaking API rate limits
        time.sleep(2)

def collect_issues(requester, repo_id, repo_name, created):
    """
    Fetch the issues for a 'repo_name'.

    Note that the Github API considers Pull Requests to be issues. Therefore, when iterating
    through a repo's issues, we get pull requests as well. At the moment, we also treat PRs
    as issues.
    TODO: handle pull requests properly (changed files, commits in this PR, possibly diffs ...)
    """
    github_client = init_github_client(requester)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        logger.debug("Skipping github issues sync for user '%s' due to rate limits", requester.username)
        return

    repo = github_client.get_repo(full_name_or_id=repo_name)
    search_args = {'state': 'all', 'sort': 'updated'}
    if not created:
        # if we are processing an already synced repo, then just look for newly updated issues
        search_args['since'] = datetime.now(timezone.utc) - timedelta(hours=6)

    i = 0
    for issue in repo.get_issues(**search_args):
        db_issue, created = Document.objects.get_or_create(
            github_issue_id=issue.id,
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if not created and db_issue.last_updated_ts and db_issue.last_updated_ts >= issue.updated_at.timestamp():
            continue
        logger.debug("Processing github issue #%s for user '%s' and repo '%s'",
                     issue.number, requester.username, repo_name)
        db_issue.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_issue.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['issue']
        db_issue.last_updated_ts = issue.updated_at.timestamp()
        db_issue.last_updated = issue.updated_at.isoformat() + 'Z'
        db_issue.webview_link = issue.html_url
        db_issue.github_title = '#{}: {}'.format(issue.number, issue.title)
        if '/pull/' in issue.html_url:
            # pull request
            db_issue.github_title = 'PR {}'.format(db_issue.github_title)

        comments = []
        if issue.comments > 0:
            for comment in issue.get_comments():
                comments.append({
                    'body': _to_html(comment.body),
                    'timestamp': comment.updated_at.timestamp(),
                    'author': {
                        'name': comment.user.login,
                        'avatar': comment.user.avatar_url,
                        'url': comment.user.html_url
                    }
                })
                # only list up to 20 comments
                if len(comments) >= 20:
                    break
        content = {'body': _to_html(issue.body), 'comments': comments}
        # take care of Algolia 10k limit
        while len(json.dumps(content).encode('UTF-8')) > 9000:
            if len(content['comments']) < 1:
                content['body'] = cut_utf_string(content['body'], 9000, step=100)
                break
            content['comments'] = content['comments'][:-1]
        db_issue.github_issue_content = content

        db_issue.github_repo_full_name = repo_name
        db_issue.github_issue_state = issue.state
        db_issue.github_issue_labels = [x.name for x in issue.labels]
        db_issue.github_issue_reporter = {
            'name': issue.user.login,
            'avatar': issue.user.avatar_url,
            'url': issue.user.html_url
        }
        db_issue.github_issue_assignees = []
        for assignee in issue.assignees:
            db_issue.github_issue_assignees.append({
                'name': assignee.login,
                'avatar': assignee.avatar_url,
                'url': assignee.html_url
            })

        algolia_engine.sync(db_issue, add=created)
        db_issue.last_synced = get_utc_timestamp()
        db_issue.download_status = Document.READY
        db_issue.save()
        # add a sleep every 50 issues to avoid breaking API rate limits
        i = i + 1
        if i % 50 == 0:
            time.sleep(20)