def getdata(page):
    """
    Scrapes the address data table from the given page
    :param page: parsed page (lxml element tree) to be scraped
    :return: dict mapping each address to its scraped data
    """
    datatable = page.find('body/div/table/tbody')
    data = {}
    # verification status is shown on the site as a green tick or a red cross image
    verified = {'/Resources/green_tick.png': True, '/Resources/red_cross.png': False}

    keys = ['Tag', 'Link', 'Verified']
    for tr in datatable:
        address = ""
        dataaddress = {}
        for k, td in enumerate(tr):
            content = list(td)[0]  # first child element of the cell
            if k == 0:
                address = content.text
                addresstype = findalladdresses(address)
                if addresstype:
                    # findalladdresses yields (type, address) tuples; keep the type label
                    dataaddress["Type"] = addresstype.pop()[0]
            else:
                # text cells hold tag/link values; image cells hold the tick/cross icon
                if content.text:
                    dataaddress[keys[k-1]] = content.text
                if content.get('src'):
                    dataaddress[keys[k-1]] = verified[content.get('src')]
        data[address] = dataaddress
    return data
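
# Usage sketch (not part of the original source). It assumes the page is
# fetched with requests and parsed with lxml.html, matching the etree/XPath
# style these examples already use; the URL format is an assumption.
import requests
from lxml import html

def scrape_address(addr):
    resp = requests.get('https://www.bitcoinwhoswho.com/address/' + addr, timeout=30)
    return getdata(html.fromstring(resp.content))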
def getfeatureaddresses(feature):
    """
    Get the addresses from a dictionary of features
    :param feature: dict of features
    :return: dict mapping each address type to a set of addresses
    """
    return scraputils.tupleset_to_dict_of_sets(
        findalladdresses(' '.join(feature.values())))
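
# scraputils.tupleset_to_dict_of_sets is a project helper not shown here; a
# minimal sketch of the behaviour this call relies on, given that
# findalladdresses yields (type, address) tuples (see getcommentsdata below):
def tupleset_to_dict_of_sets(tuples):
    result = {}
    for key, value in tuples:
        result.setdefault(key, set()).add(value)  # group addresses by type
    return result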
def getcommentsdata(page):
    """
    Scrapes data from the comments of the article on the page
    :param page: parsed html of the page to be scraped
    :return: scraped data
    """
    data = []

    commentlist = page.find(
        'body/div/div[@id="main"]/div[@id="primary"]/div/div[@id="comments"]/ol'
    )

    for comment in commentlist:
        if isinstance(comment.tag, str):  # skip HTML comment nodes, whose .tag is not a string
            commentdata = {}
            author = comment.find(
                'article/footer/div[@class="comment-author vcard"]/b')
            if author.text:
                commentdata["Name"] = author.text
            else:
                # the author name is wrapped in a link; take the name and URL from it
                a_author = author[0]
                commentdata["Name"] = a_author.text
                authorurl = a_author.get("href")

                if validators.url(authorurl):  # ValidationFailure is falsy
                    commentdata["Url"] = authorurl

            content = comment.find('article/div')

            # Retrieve addresses from plain html (so find addresses in both attributes and text)
            addrs = findalladdresses(
                etree.tostring(content, encoding=str, method="html"))

            if addrs:  # only keep comments that actually contain addresses
                # Retrieve only the text without tags
                contenttext = " ".join(
                    etree.tostring(content, encoding=str,
                                   method="text").split())

                commentdata["Comment"] = contenttext

                commentdata["Source"] = "BitcoinWhosWho: Comments"

                for addr in addrs:
                    if addr[0] in commentdata:
                        commentdata[addr[0]].append(addr[1])
                    else:
                        commentdata[addr[0]] = [addr[1]]

                data.append(commentdata)

    return data
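
# Shape of one entry returned by getcommentsdata (an illustrative sample, not
# real output; the "Btc" key is hypothetical and depends on the labels that
# findalladdresses assigns to matched addresses):
#
#   {
#       "Name": "alice",
#       "Url": "https://example.com",   # only when the author link is a valid url
#       "Comment": "Scammed by this address ...",
#       "Source": "BitcoinWhosWho: Comments",
#       "Btc": ["1BoatSLRHtKNngkdXEeobR76b53LETtpyT"],
#   }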
def datatojson(page, jsonfile, users, u):
    """
    Dumps data into the json file when the page yields valid addresses; when
    nothing significant is extracted, the previous user's empty placeholder
    entry is replaced instead
    :param page: page to be scraped
    :param jsonfile: open (read/write) file object of the json output
    :param users: list of all user entries in the json, updated with each new find
    :param u: current user id
    :return:
    """

    # This helper lets the program restart from the last checked user, not just the last user with data
    def addlastcheckeduser(data):
        if users:
            # if the previous entry is an empty placeholder (no "Name"), drop
            # it so only the newest checkpoint is kept
            if str(u - 1) == users[-1]["BitcoinTalkID"]:
                if "Name" not in users[-1]:
                    del users[-1]
        data.update({"BitcoinTalkID": str(u), "Source": "BitcoinTalk"})
        users.append(data)
        jsonfile.seek(0, 0)
        dump(users, jsonfile, indent=4)
        jsonfile.truncate()  # defensive: drop stale bytes if the rewrite is shorter

    if not isemptypage(page):
        result = getfeatures(page)
        # join concatenates all string values of 'result' into one searchable blob
        addresses = scraputils.tupleset_to_dict_of_sets(
            findalladdresses(' '.join(result.values())))
        # Users with no addresses are not useful
        if result and addresses:
            result.update(addresses)
            # print(result)
            addlastcheckeduser(result)
        else:
            addlastcheckeduser({})
    else:
        addlastcheckeduser({})
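
# Usage sketch (not part of the original source). Assumptions: profiles are
# scraped by incrementing numeric BitcoinTalk user ids, 'getpage' is a
# hypothetical fetch-and-parse helper, and the file is opened 'r+' so that
# dump() inside datatojson can rewrite it in place.
from json import load

def scrape_users(path, start, stop):
    with open(path, 'r+') as jsonfile:
        users = load(jsonfile)
        for u in range(start, stop):
            page = getpage('https://bitcointalk.org/index.php?action=profile;u=%d' % u)
            datatojson(page, jsonfile, users, u)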
def process_data(filename, values, **kwargs):
    """
    Process data returned by the GUI
    :param filename: filename as the output of data
    :param values: values of the dict returned by GUI
    :param kwargs:
            cbox_order: array containing the sort order of fields
            addrs_keys: array containing the fields of addresses
            group_addr: array containing the fields of addresses by which data are grouped
            preserve_null: a boolean controlling whether null or empty arrays are preserved
    :return:
    """
    cbox_order = kwargs.get('cbox_order')
    addrs_keys = kwargs.get('addrs_keys')
    group_addr = kwargs.get('group_addr')
    preserve_null = kwargs.get('preserve_null')

    query = copy.deepcopy(queries.queries)

    if values['Tab'] == 'Group by':
        # It is called a pipeline after MongoDB's aggregation framework
        pipeline = query[0]
        unwind_dict = []
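        # Rough shape of the base pipeline from queries.queries, reconstructed
        # from the index accesses below (the real definition is not shown here):
        #   [{"$group": {"_id": {}}},              # [0]
        #    {"$project": {"or": {"$or": []}}},    # [1]
        #    {"$sort": {}},                        # [2]
        #    ...,                                  # [3] not modified here
        #    {"$project": {}}]                     # [4]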

        # values[0] holds the grouping choice made in the GUI ('User', 'Source' or 'Address')
        if values[0] != 'Address':
            field_group = "Name" if values[0] == 'User' else 'Source'
            pipeline[0]["$group"]["_id"][values[0]] = "$data." + field_group
            pipeline[1]["$project"][values[0]] = "$_id." + values[0]

        for a in cbox_order:
            if a == 'users':
                field = "Name"
            elif a == 'sources':
                field = "Source"
            else:
                field = group_addr[addrs_keys.index(a)]
                unwind_dict.append({
                    "$unwind": {
                        "path": "$data." + group_addr[addrs_keys.index(a)],
                        "preserveNullAndEmptyArrays": preserve_null
                    }
                })

            pipeline[0]["$group"][a] = {"$addToSet": "$data." + field}
            pipeline[1]["$project"][a] = 1
            pipeline[1]["$project"]["size " + a] = {"$size": "$" + a}
            pipeline[1]["$project"]["or"]["$or"].append({"$size": "$" + a})
            pipeline[2]["$sort"]["size " + a] = -1
            if not values["size " + a]:
                pipeline[4]["$project"]["size " + a] = 0

        if values[0] == 'Address':
            for a in group_addr:
                if values[a]:  # the checkbox for this address field is ticked
                    pipeline[0]["$group"]["_id"][a] = "$data." + a
                    pipeline[1]["$project"][a] = "$_id." + a
                    unwind_dict.append({"$unwind": {"path": "$data." + a}})

        # $unwind stages must run before the $group, so prepend them
        for u in unwind_dict:
            pipeline.insert(0, u)

        execute_query('Aggregation', pipeline, filename)

    if values['Tab'] == 'Find':
        query = {}
        if values['InputUser'].strip():
            query['data.Name'] = values['InputUser'].strip()

        if values['InputAddr'].strip():
            result = findalladdresses(values['InputAddr'].strip())
            if result:
                query["$or"] = []
                for r in result:
                    query["$or"].append({"data." + r[0]: r[1]})
            else:
                sg.PopupError('Address not valid')
                return

        if values['InputSource'].strip():
            query['data.Source'] = values['InputSource'].strip()

        if values['InputCustom'].strip():
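            # Expected input syntax (inferred from the parsing below): pairs
            # separated by '$$', each written as 'field %% value', e.g.
            # 'Position %% 5 $$ Verified %% True'.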
            inputs = values['InputCustom'].split('$$')
            for inp in inputs:
                vals = inp.split('%%')
                vals = [x.strip() for x in vals]
                # query the raw string and, where it parses, typed variants too
                in_check = [vals[1]]

                # optionally signed integer -> also match the numeric value
                if (vals[1][0] in ['+', '-']
                        and vals[1][1:].isdigit()) or vals[1].isdigit():
                    in_check.append(int(vals[1]))

                if vals[1] in ('True', 'true'):  # also match the boolean value
                    in_check.append(True)

                query['data.' + vals[0]] = {"$in": in_check}

        execute_query('Find', query, filename)
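
# Example of the filter document the 'Find' branch builds (illustrative, not
# from the original source; 'data.Btc' is a hypothetical key that depends on
# the labels findalladdresses assigns):
#
#   InputUser = 'alice' and InputAddr = '1BoatSLRHtKNngkdXEeobR76b53LETtpyT'
#   yield roughly:
#
#   {
#       'data.Name': 'alice',
#       '$or': [{'data.Btc': '1BoatSLRHtKNngkdXEeobR76b53LETtpyT'}],
#   }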