Exemple #1
0
 def test_basics(self):
     """Check ensure_protocol with the default and with explicit protocols."""
     # Default protocol: every (input, expected) pair of the TESTS table.
     for url, output in TESTS:
         assert ensure_protocol(url) == output

     # Explicit protocol argument: (input url, protocol, expected result).
     explicit_cases = (
         # A bare url receives the requested scheme.
         ('lemonde.fr?utm_hp_ref=test', 'ftp',
          'ftp://lemonde.fr?utm_hp_ref=test'),
         # A url that already carries a scheme is returned as is.
         ('https://lemonde.fr?utm_hp_ref=test', 'ftp',
          'https://lemonde.fr?utm_hp_ref=test'),
         # The protocol may be given with its trailing '://'.
         ('lemonde.fr?utm_hp_ref=test', 'http://',
          'http://lemonde.fr?utm_hp_ref=test'),
     )

     for raw, protocol, expected in explicit_cases:
         assert ensure_protocol(raw, protocol) == expected
def crowdtangle_posts_by_id_action(namespace, output_file):
    """Enrich a CSV of Facebook post urls with the matching CrowdTangle posts.

    Each cell of column ``namespace.column`` is read from ``namespace.file``.
    When the cell holds a Facebook post url from which a post id can be
    extracted, the post is fetched through the CrowdTangle client and
    appended to the row; otherwise the row is written back unchanged.

    Args:
        namespace: parsed CLI arguments; the attributes read here are
            ``token``, ``rate_limit``, ``file``, ``select``, ``resume``,
            ``total`` and ``column``.
        output_file: destination handed to the casanova enricher.

    An invalid API token ends the command through ``die``.
    """
    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    # Number of input rows reported as already processed on resume.
    resumed_count = 0

    def count_resumed_rows(event, row):
        nonlocal resumed_count

        if event == 'resume.input':
            resumed_count += 1

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CROWDTANGLE_POST_CSV_HEADERS,
        resumable=namespace.resume,
        listener=count_resumed_rows
    )

    loading_bar = tqdm(
        desc='Retrieving posts',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' posts'
    )

    # Start the bar at the count gathered by the resume listener so far.
    loading_bar.update(resumed_count)
    loading_bar_context = LoadingBarContext(loading_bar)

    try:
        for row, cell in enricher.cells(namespace.column, with_rows=True):
            with loading_bar_context:
                # Stays None unless the cell resolves to a Facebook post id.
                post_id = None
                url = cell.strip()

                if url:
                    url = ensure_protocol(url)

                    if is_facebook_post_url(url):
                        post_id = facebook.post_id_from_url(url)

                if post_id is None:
                    # Empty cell, non-post url or unparsable id:
                    # keep the row, add nothing.
                    enricher.writerow(row)
                else:
                    enricher.writerow(
                        row,
                        client.post(post_id, format='csv_row')
                    )

    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
Exemple #3
0
def domain_from_url(url):
    """Return the network location of *url* without its userinfo prefix.

    The url is first passed through ``ensure_protocol`` and then parsed with
    ``urlparse``. Everything up to and including the last ``@`` of the
    parsed netloc is dropped; the rest of the netloc (including a port,
    when one is present) is returned unchanged.

    Args:
        url: the url to inspect, with or without a scheme.

    Returns:
        The part of the netloc following the last ``@``, or the whole
        netloc when it contains no ``@``.
    """
    netloc = urlparse(ensure_protocol(url)).netloc

    # Userinfo ends at the LAST "@": a password such as "p@ss" would
    # otherwise be mistaken for the host when splitting on every "@".
    return netloc.rpartition('@')[2]
Exemple #4
0
    def create(self, request):
        """Create a meme from the ``url``, ``name`` and ``caption`` request fields.

        Responses:
            201 with ``{'id': '<id>'}`` once the meme is saved.
            400 when ``url`` is missing or does not form a valid url, when
                ``name`` contains characters other than letters, digits,
                underscore and hyphen, or when the serializer rejects the data.
            409 when a meme with the same url, name and caption already exists.
        """
        raw_url = request.data.get('url')
        name = request.data.get('name')

        # checks if url field is blank
        if raw_url is None:
            return Response({"message": "URL field cannot be blank"}, status=status.HTTP_400_BAD_REQUEST)

        # a non-string value can never form a valid url: reject it before it
        # reaches the string-based url helpers
        if not isinstance(raw_url, str):
            return Response({"message": "Enter a valid url"}, status=status.HTTP_400_BAD_REQUEST)

        # processes a malformed url, rejects values that do not form a valid url
        schemed_url = ensure_protocol(raw_url, protocol='https')
        if not is_url(schemed_url):
            return Response({"message": "Enter a valid url"}, status=status.HTTP_400_BAD_REQUEST)

        # checks if name input contains only letters, numbers, underscore and
        # hyphen; fullmatch is used because "$" would also accept a trailing
        # newline, and non-string names are rejected instead of crashing re
        if name is not None and not (
                isinstance(name, str) and re.fullmatch(r"[A-Za-z0-9_-]*", name)):
            return Response({"message": "Name can contain only letters,numbers,underscore and hyphen"}, status=status.HTTP_400_BAD_REQUEST)

        # maps request data to serializer class to get an object
        serializer = serializers.MemeSerializer(
            data={"name": name, "url": schemed_url, "caption": request.data.get("caption")})

        # if any required data was missing or if the serializer object could
        # not be created, return the exact serialization error that occurred
        # with bad request status code
        if not serializer.is_valid():
            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        # extract the validated parameters sent in request data
        creator = serializer.data.get('name')
        caption = serializer.data.get('caption')

        # check if meme object already exists; exists() asks the database for
        # a yes/no answer instead of loading every matching row
        duplicates = Meme.objects.filter(url=schemed_url).filter(
            name=creator).filter(caption=caption)
        if duplicates.exists():
            return Response({'message': 'This meme already exists'}, status=status.HTTP_409_CONFLICT)

        # set creationDateTime and lastUpdate to the same current instant,
        # creationDate to the current date
        now = timezone.now()

        # if meme object does not exist create a new meme by saving it to database
        obj = Meme(caption=caption, url=schemed_url, name=creator, creationDateTime=now,
                   creationDate=date.today(), lastUpdate=now)
        obj.save()

        # return the id of the meme object created with created status code
        return Response({'id': str(obj.id)}, status=status.HTTP_201_CREATED)
Exemple #5
0
    def payloads():
        """Yield one FetchWorkerPayload per item of the enclosing ``iterator``.

        The url is the item itself, or ``key(item)`` when a key function is
        set. A falsy url is forwarded as ``None``; any other url is stripped
        and passed through ``ensure_protocol`` first.
        """
        for item in iterator:
            raw = key(item) if key is not None else item

            # Url cleanup (only when there is a url to clean)
            cleaned = ensure_protocol(raw.strip()) if raw else None

            yield FetchWorkerPayload(http=http, item=item, url=cleaned)
Exemple #6
0
def payloads_iter(iterator, key=None):
    """Yield a FetchWorkerPayload for every item of *iterator*.

    Args:
        iterator: iterable of items to wrap.
        key: optional callable extracting the url from an item; when
            omitted the item itself is used as the url.

    Yields:
        A payload carrying the item, its cleaned url and the domain name
        computed from that url. Both url and domain are ``None`` when the
        item has no (or a falsy) url.
    """
    for item in iterator:
        raw = key(item) if key is not None else item

        if raw:
            # Url cleanup
            cleaned = ensure_protocol(raw.strip())
            domain = get_domain_name(cleaned)
        else:
            cleaned = None
            domain = None

        yield FetchWorkerPayload(item=item, domain=domain, url=cleaned)
Exemple #7
0
    def partial_update(self, request, pk=None):
        """Update the url and/or caption of the meme identified by ``pk``.

        Responses:
            204 once the meme has been saved.
            400 when ``pk`` is not a non-negative integer, when ``name`` is
                supplied, when neither ``url`` nor ``caption`` is supplied,
                when ``url`` does not form a valid url, or when the
                serializer rejects the data.
            404 when no meme has this id.
        """
        # if id input is anything other than an integer (including a missing
        # pk, which int() reports as TypeError), return status bad request
        try:
            meme_id = int(pk)
        except (TypeError, ValueError):
            return Response({"message": "Enter positive number"}, status=status.HTTP_400_BAD_REQUEST)

        # if id input is negative integer, return status bad request
        if meme_id < 0:
            return Response({"message": "Enter positive number"}, status=status.HTTP_400_BAD_REQUEST)

        # get meme object by its id; if it does not exist return http status not found
        obj = Meme.objects.filter(id=meme_id).first()
        if obj is None:
            return Response(status=status.HTTP_404_NOT_FOUND)

        if request.data.get('name') is not None:
            return Response({"message": "Creator name cannot be changed!!"}, status=status.HTTP_400_BAD_REQUEST)

        url = request.data.get('url')
        caption = request.data.get('caption')

        if url is None and caption is None:
            return Response({"message": "Both url and caption cannot be none"}, status=status.HTTP_400_BAD_REQUEST)

        # if only caption is supplied, only caption is updated
        if url is None:
            obj.caption = caption
            obj.lastUpdate = timezone.now()
            obj.save()
            # return response no content if successfully updated
            return Response(status=status.HTTP_204_NO_CONTENT)

        # a non-string value can never form a valid url: reject it before it
        # reaches the string-based url helpers
        if not isinstance(url, str):
            return Response({"message": "Enter a valid url"}, status=status.HTTP_400_BAD_REQUEST)

        # if url entered has no scheme, add scheme to url and check if a valid url is formed
        schemed_url = ensure_protocol(url, protocol='https')
        if not is_url(schemed_url):
            return Response({"message": "Enter a valid url"}, status=status.HTTP_400_BAD_REQUEST)

        # map request data to serializer based on data supplied: the url is
        # always present at this point, the caption only when it was sent
        payload = {"url": schemed_url}
        if caption is not None:
            payload["caption"] = caption
        serializer = serializers.MemeUpdateSerializer(data=payload, partial=True)

        # invalid data is a client error: report the serializer errors with
        # bad request status, not "not found"
        if not serializer.is_valid():
            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        new_caption = serializer.data.get('caption')
        new_url = serializer.data.get('url')

        # Track real changes explicitly: a caption that was not sent may read
        # back as None and must not count as a modification.
        changed = False

        # update caption field of meme object only if a different caption was given
        if new_caption is not None and new_caption != obj.caption:
            obj.caption = new_caption
            changed = True

        # update url field of meme object only if a different url was given
        if new_url is not None and new_url != obj.url:
            obj.url = new_url
            changed = True

        # if any of the fields were updated, set lastUpdate to current date and time
        if changed:
            obj.lastUpdate = timezone.now()

        # save the meme object
        obj.save()
        # return response no content if successfully updated
        return Response(status=status.HTTP_204_NO_CONTENT)