Example #1
def refresh_unpublished_videos():
    session = db_session()
    a_month_ago = dt.datetime.utcnow() - dt.timedelta(days=30)
    # videos with a channel that have not yet been marked as published,
    # uploaded within the last month
    unpublished_videos = \
        session.query(Video).filter(
            Video.channel_id.isnot(None),
            Video.published_at.is_(None),
            Video.uploaded_at > a_month_ago,
        )

    trusted_block_num = confirmed_block_num(5)

    def is_published(video_id):
        addr = get_publisher_address(video_id, trusted_block_num)
        return addr, video_id

    # fetch publisher addresses in parallel; lkeep() drops falsy results
    publish_results = lkeep(
        thread_multi(
            fn=is_published,
            fn_args=[None],
            dep_args=[video.id for video in unpublished_videos],
            max_workers=10,
            re_raise_errors=False,
        ))

    for publisher_addr, video_id in publish_results:
        if publisher_addr != null_address:
            video = session.query(Video).filter_by(id=video_id).one()
            video.published_at = dt.datetime.utcnow()
            video.eth_address = publisher_addr
            session.add(video)

    session.commit()
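
The lookups above rely on lkeep() discarding falsy entries, so videos whose address lookup produced no result are simply skipped. A minimal, self-contained sketch of that behaviour using funcy's lkeep (the tuples and None entries below are made-up stand-ins for thread_multi() output):

from funcy import lkeep

raw_results = [('0xabc', 'video-1'), None, ('0xdef', 'video-2'), None]
assert lkeep(raw_results) == [('0xabc', 'video-1'), ('0xdef', 'video-2')]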
Example #2
def identify_placement(tiles: Dict[int, HashTile]) -> Placement:
    # figure out the placement and orientation of each tile in the final image

    # fix one corner tile in its original orientation
    start = corners(tiles)[0]

    # the position and orientation of each tile in the picture
    img_positions = {start: (0, 0)}
    img_orients = {start: 0}

    # and initialize the fringe with its non-None neighbouring tiles
    unplaced = set(lkeep(tiles[start].neighbours))

    # assemble all the tiles
    while len(unplaced) > 0:
        # pop any tile from the fringe (and also grab its neighbouring tiles)
        candidate = unplaced.pop()
        neighbours = set(lkeep(tiles[candidate].neighbours))

        # pick one of the candidate's neighbours that has already been placed
        fixed = (neighbours & img_positions.keys()).pop()

        # figure out the placement of the candidate tile
        fixed_tile = orient(tiles[fixed].tile, k=img_orients[fixed])
        placement = identify_tile_placement(fixed_tile, tiles[candidate].tile)

        # place the candidate relative to the fixed tile
        x, y = img_positions[fixed]
        img_positions[candidate] = (x + placement.dx, y + placement.dy)
        img_orients[candidate] = placement.orient

        # add the candidate's neighbours for the next iterations
        new_neighbours = neighbours - img_positions.keys()
        unplaced.update(new_neighbours)

    return Placement(positions=img_positions, orients=img_orients)
Example #3
def delete_video_files(video_id: str, file_mapper_obj: dict):
    # delete upload files
    s3 = boto3.client(
        's3',
        region_name=S3_UPLOADS_REGION,
        aws_access_key_id=AWS_MANAGER_PUBLIC_KEY,
        aws_secret_access_key=AWS_MANAGER_PRIVATE_KEY,
    )
    # collect the upload keys, dropping any that are unset (falsy)
    keys_to_delete = lkeep([
        file_mapper_obj['s3_upload_video_key'],
        file_mapper_obj['s3_upload_thumbnail_key'],
    ])
    s3.delete_objects(
        Bucket=file_mapper_obj['s3_upload_bucket'],
        Delete={
            'Objects': [{
                'Key': x
            } for x in keys_to_delete],
        },
    )

    # delete video files
    s3t = S3Transfer(
        region_name=S3_VIDEOS_REGION,
        bucket_name=S3_VIDEOS_BUCKET,
    )
    keys_to_delete = [
        *s3t.ls(f'snapshots/{video_id}'),
        *s3t.ls(f'thumbnails/{video_id}'),
        *s3t.ls(f'v1/{video_id}'),
    ]
    if keys_to_delete:
        s3t.client.delete_objects(
            Bucket=S3_VIDEOS_BUCKET,
            Delete={
                'Objects': [{
                    'Key': x['Key']
                } for x in keys_to_delete],
            },
        )
Example #4
def scrape_comments(mongo, batch_size=250, max_workers=50):
    """ Parse operations and post-process for comment/post extraction. """
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('comments')

    query = {
        "type": "comment",
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'block_num': 1,
        'author': 1,
        'permlink': 1,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    identifiers = set(f"{x['author']}/{x['permlink']}" for x in results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # get Post.export() results in parallel
    raw_comments = thread_multi(fn=get_comment,
                                fn_args=[None],
                                dep_args=list(identifiers),
                                max_workers=max_workers,
                                yield_results=True)
    raw_comments = lkeep(raw_comments)

    # split into root posts and comments
    posts = lfilter(lambda x: x['depth'] == 0, raw_comments)
    comments = lfilter(lambda x: x['depth'] > 0, raw_comments)

    # Mongo upsert many
    log_output = ''
    if posts:
        r = mongo.Posts.bulk_write(
            [
                UpdateOne({'identifier': x['identifier']},
                          {'$set': {
                              **x, 'updatedAt': dt.datetime.utcnow()
                          }},
                          upsert=True) for x in posts
            ],
            ordered=False,
        )
        log_output += \
            f'(Posts: {r.upserted_count} upserted, {r.modified_count} modified) '
    if comments:
        r = mongo.Comments.bulk_write(
            [
                UpdateOne({'identifier': x['identifier']},
                          {'$set': {
                              **x, 'updatedAt': dt.datetime.utcnow()
                          }},
                          upsert=True) for x in comments
            ],
            ordered=False,
        )
        log_output += \
            f'(Comments: {r.upserted_count} upserted, {r.modified_count} modified) '

    # We only query {type: 'comment'} operations, and the gaps between them
    # can be larger than batch_size; when the batch is empty, advance the
    # checkpoint by a full batch.
    index = silent(max)(lpluck('block_num',
                               results)) or (start_block + batch_size)
    indexer.set_checkpoint('comments', index)

    log.info(f'Checkpoint: {index} {log_output}')
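
The checkpoint fallback works because funcy's silent() turns the ValueError that max() raises on an empty sequence into None, which lets the trailing "or" supply the default. A small sketch of that idiom with made-up block numbers:

from funcy import silent, lpluck

start_block, batch_size = 1000, 250

# empty batch: max() would raise, silent() returns None, so the fallback applies
index = silent(max)(lpluck('block_num', [])) or (start_block + batch_size)
assert index == 1250

# non-empty batch: the highest block_num wins
results = [{'block_num': 1010}, {'block_num': 1042}]
index = silent(max)(lpluck('block_num', results)) or (start_block + batch_size)
assert index == 1042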
Example #5
def fill_probes(platform_id):
    platform = Platform.objects.get(pk=platform_id)
    gpl_name = platform.gpl_name
    cprint('%s %s %s' % (platform.pk, platform.gpl_name, platform.specie),
           attrs=['bold'])
    assert platform.specie

    platform.verdict = ''
    platform.probes_total = None
    platform.probes_matched = None
    platform.stats = {}
    platform.last_filled = timezone.now()

    annot_file = '/pub/geo/DATA/annotation/platforms/%s.annot.gz' % gpl_name
    family_file = '/pub/geo/DATA/SOFT/by_platform/%s/%s_family.soft.gz' % (
        gpl_name, gpl_name)
    files = [annot_file, family_file]
    tables = list(map(peek_platform, files))
    # Skip empty
    files = list(compress(files, tables))
    tables = lkeep(tables)

    # TODO: check other supplementary files formats
    supplementary_dir = '/pub/geo/DATA/supplementary/platforms/%s/' % gpl_name
    _, supplementary_files = listdir(supplementary_dir)
    supplementary_files = [
        f for f in supplementary_files
        if f.endswith('.txt.gz') and not re_test(r'\.cdf\.', f, re.I)
    ]
    files.extend(supplementary_files)
    tables.extend(
        decompress(download('%s%s' % (supplementary_dir, f)))
        for f in supplementary_files)
    platform.stats['files'] = lkeep(files)

    if not any(tables):
        cprint('No data for %s' % gpl_name, 'red')
        platform.verdict = 'no data'
        platform.save()
        return

    # Read tables in
    df = pd.concat(
        read_table(table, file) for table, file in zip(tables, files))
    del tables  # free memory
    platform.probes_total = len(set(df.index))
    cprint('Found %d probes to match' % platform.probes_total, 'yellow')
    # import ipdb; ipdb.set_trace()  # noqa

    # Try to resolve probes starting from best scopes
    mygene_probes = []
    platform.stats['matches'] = []
    platform.verdict = 'no clue'
    for scopes, cols in SCOPE_COLUMNS:
        cols = list(set(cols) & set(df.columns))
        if not cols:
            continue
        cprint('> Looking into %s' % ', '.join(sorted(cols)), 'cyan')
        platform.verdict = 'nothing matched'

        probes = pd.concat(df[col].dropna() for col in cols)
        new_matches = mygene_fetch(platform, probes, scopes)
        mygene_probes.extend(new_matches)

        # Drop matched probes
        if new_matches:
            platform.stats['matches'].append({
                'scopes': scopes,
                'cols': cols,
                'found': len(new_matches),
            })

            df = df.drop(lpluck('probe', new_matches))
            if df.empty:
                break

    # Update stats and history
    platform.probes_matched = len(mygene_probes)
    platform.history.append({
        'time': timezone.now().strftime('%Y-%m-%d %T'),
        'probes_total': platform.probes_total,
        'probes_matched': platform.probes_matched,
    })

    # Insert found genes
    if mygene_probes:
        with transaction.atomic():
            platform.verdict = 'ok'
            platform.save()

            platform.probes.all().delete()
            PlatformProbe.objects.bulk_create([
                PlatformProbe(platform=platform, **probe_info)
                for probe_info in mygene_probes
            ])
        cprint('Inserted %d probes for %s' % (len(mygene_probes), gpl_name),
               'green')
    else:
        cprint('Nothing matched for %s' % gpl_name, 'red')
        platform.save()
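
Note how compress() and lkeep() above apply the same truthiness test, which keeps files and tables aligned after dropping entries whose table could not be read. A short sketch with fabricated values (None stands in for a file that peek_platform() could not parse):

from itertools import compress
from funcy import lkeep

files = ['GPL123.annot.gz', 'GPL123_family.soft.gz']
tables = ['<parsed table>', None]

files = list(compress(files, tables))   # -> ['GPL123.annot.gz']
tables = lkeep(tables)                  # -> ['<parsed table>']
assert len(files) == len(tables)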
Example #6
def search(request):
    # Save last specie in session
    specie = request.GET.get('specie')
    if specie != request.session.get('specie'):
        request.session['specie'] = specie

    q = request.GET.get('q')
    if not q:
        return {'series': None}

    exclude_tags = lkeep(silent(int), request.GET.getlist('exclude_tags'))
    series_tags, tag_series, tag_ids = series_tags_data()

    # Parse query
    q_string, q_tags = _parse_query(q)
    q_tags, wrong_tags = lsplit(lambda t: t.lower() in tag_ids, q_tags)
    if wrong_tags:
        message = 'Unknown tag%s %s.' % ('s' if len(wrong_tags) > 1 else '',
                                         ', '.join(wrong_tags))
        messages.warning(request, message)
    if not q_string and not q_tags:
        return {'series': None}

    # Build qs
    qs = search_series_qs(q_string)
    if specie:
        qs = qs.filter(specie=specie)

    if q_tags:
        q_tag_ids = lkeep(tag_ids.get(t.lower()) for t in q_tags)
        include_series = reduce(set.intersection,
                                (tag_series[t] for t in q_tag_ids))
        if include_series:
            qs = qs.filter(id__in=include_series)
        else:
            message = 'No series annotated with %s.' \
                % (q_tags[0] if len(q_tags) == 1 else 'all these tags simultaneously')
            messages.warning(request, message)
            return {'series': []}

    series_ids = qs.values_list('id', flat=True).order_by()
    tags = ldistinct(mapcat(series_tags, series_ids), key=itemgetter('id'))

    if exclude_tags:
        exclude_series = join(tag_series[t] for t in exclude_tags)
        qs = qs.exclude(id__in=exclude_series)

    series = paginate(request, qs, 10)

    # Get annotation statuses
    annos_qs = SeriesAnnotation.objects.filter(series__in=series) \
                               .values_list('series_id', 'tag_id', 'best_cohens_kappa')
    tags_validated = {(s, t): k == 1 for s, t, k in annos_qs}

    return dict(
        {
            'series': series,
            'page': series,
            'tags_validated': tags_validated,
            'tags': tags,
            'series_tags': series_tags,
        }, **_search_stats(qs))
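
The exclude_tags line shows lkeep() in its two-argument form: silent(int) is mapped over the raw query-string values and falsy results are dropped, so non-numeric entries are ignored. A brief sketch with a made-up input list:

from funcy import lkeep, silent

raw = ['3', '17', 'not-a-number', '']
assert lkeep(silent(int), raw) == [3, 17]

Note that a literal '0' would also be dropped, since lkeep filters by truthiness rather than by "is not None".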