コード例 #1
0
ファイル: get_texts.py プロジェクト: clarkwkw/GEStatProj
	def parse(self, response):
		"""Convert one API response into a flat item dict.

		The response is first passed to ``utils.simple_check``; if that call
		raises, the failure is logged and the spider is stopped.

		Yields:
			dict: ``gid``, ``section``, ``headline``, ``text`` and
			``wordcount`` taken from ``json_response["content"]``; headline
			and text are passed through ``utils.remove_html``.

		Raises:
			scrapy.exceptions.CloseSpider: if ``utils.simple_check`` raises.
		"""
		try:
			json_response = utils.simple_check(response)
		except Exception as e:
			# Format the exception itself rather than ``e.message``: that
			# attribute does not exist on Python 3 exceptions, so reading it
			# would replace the intended CloseSpider with an AttributeError.
			reason = "request_failed: %s" % e
			logging.error("%s", reason)
			raise scrapy.exceptions.CloseSpider(reason)

		content = json_response["content"]
		yield {
			"gid": content["id"],
			"section": content["sectionId"],
			"headline": utils.remove_html(content["fields"]["headline"]),
			"text": utils.remove_html(content["fields"]["body"]),
			"wordcount": content["fields"]["wordcount"],
		}
コード例 #2
0
    def parse(self, response):
        """Convert one API response into a flat item dict.

        The response is first passed to ``utils.simple_check``; if that call
        raises, the failure is logged and the spider is stopped.

        Yields:
            dict: ``gid``, ``section``, ``headline``, ``text`` and
            ``wordcount`` taken from ``json_response["content"]``; headline
            and text are passed through ``utils.remove_html``.

        Raises:
            scrapy.exceptions.CloseSpider: if ``utils.simple_check`` raises.
        """
        try:
            json_response = utils.simple_check(response)
        except Exception as e:
            # Format the exception itself rather than ``e.message``: that
            # attribute does not exist on Python 3 exceptions, so reading it
            # would replace the intended CloseSpider with an AttributeError.
            reason = "request_failed: %s" % e
            logging.error("%s", reason)
            raise scrapy.exceptions.CloseSpider(reason)

        content = json_response["content"]
        yield {
            "gid": content["id"],
            "section": content["sectionId"],
            "headline": utils.remove_html(content["fields"]["headline"]),
            "text": utils.remove_html(content["fields"]["body"]),
            "wordcount": content["fields"]["wordcount"],
        }
コード例 #3
0
    def pipeline(self, text):
        """Run ``text`` through the ``utils`` cleaning helpers, in order.

        The helpers are applied as: ``remove_space``, ``remove_punct``,
        ``remove_contractions`` (on the lower-cased text, with the
        module-level ``contractions``), ``remove_url``, ``remove_html`` and
        finally ``correct_spellings``.

        Returns:
            Whatever ``utils.correct_spellings`` returns for the cleaned text.
        """
        stripped = utils.remove_punct(utils.remove_space(text))
        expanded = utils.remove_contractions(stripped.lower(), contractions)
        without_markup = utils.remove_html(utils.remove_url(expanded))
        return utils.correct_spellings(without_markup)
コード例 #4
0
ファイル: database.py プロジェクト: emilyUsesPascal/ReplDex
async def fix_entry(data):
    """Normalise an entry document in place and return it.

    * A string ``image`` has ``imag.cf`` rewritten to ``i.matdoes.dev`` and is
      then replaced by ``images.get_data(...)`` for that URL.
    * A non-string ``image`` without a truthy ``thumbnail_b64`` is replaced
      by ``images.get_data(image['src'])``.
    * If anything changed so far, the whole document is written back with
      ``$set``.
    * ``content`` is then passed through ``utils.fix_html`` and
      ``nohtml_content`` is filled in when absent; these last two changes
      are made on the returned dict only and are not written back here.

    Returns:
        The same ``data`` mapping, or ``None`` when ``data`` is ``None``.
    """
    if data is None:
        return

    snapshot = dict(data)
    image = data.get('image')
    if image and isinstance(image, str):
        # Rewrite the host first, then swap the URL for the loaded image data.
        data['image'] = image.replace('imag.cf', 'i.matdoes.dev')
        data['image'] = await images.get_data(data['image'])
    elif image and not image.get('thumbnail_b64'):
        data['image'] = await images.get_data(image['src'])

    if data != snapshot:
        await entries_coll.update_one({'_id': data['_id']}, {'$set': data})

    data['content'] = utils.fix_html(data['content'])
    if 'nohtml_content' not in data:
        data['nohtml_content'] = utils.remove_html(data['content'])
    return data
コード例 #5
0
async def edit_entry(title, content, editor=None, unlisted=False, entry_id=None, image=None):
	"""Upsert an entry and append a matching record to its ``history``.

	Args:
		title: entry title; surrounding whitespace is stripped.
		content: entry HTML; passed through ``utils.fix_html`` before storing.
		editor: stored as ``author`` on the history record.
		unlisted: stored on the history record always, and on the entry
			itself unless it is ``None``.
		entry_id: ``_id`` of the entry to update; a falsy value makes a new
			``uuid4`` string id.
		image: optional image source, stored as ``{'src': image}``.

	Returns:
		The ``_id`` that was written.
	"""
	timestamp = datetime.now()
	clean_title = title.strip()
	html = utils.fix_html(content)

	fields_to_set = {
		'title': clean_title,
		'content': html,
		'last_edited': timestamp,
		'nohtml_content': utils.remove_html(html)
	}
	if unlisted is not None:
		fields_to_set['unlisted'] = unlisted

	history_record = {
		'author': editor,
		'content': html,
		'title': clean_title,
		'time': timestamp,
		'unlisted': unlisted,
	}
	if image is not None:
		# Two distinct dict objects: the entry and its history record must
		# not share a mutable value.
		fields_to_set['image'] = {'src': image}
		history_record['image'] = {'src': image}

	target_id = entry_id or str(uuid.uuid4())
	await entries_coll.update_one(
		{'_id': target_id},
		{'$set': fields_to_set, '$push': {'history': history_record}},
		upsert=True
	)
	return target_id
コード例 #6
0
async def create_response(entry_data, preview=False):
    """Build the dict describing an entry.

    Args:
        entry_data: entry document; must contain ``_id`` and ``title``.
        preview: when truthy, return the short preview form.

    Returns:
        * a ``web.HTTPNotFound`` instance (returned, not raised) when
          ``entry_data`` is falsy;
        * for previews: ``title``, ``preview`` (content with HTML removed),
          ``html``, ``id`` and ``image``;
        * otherwise: ``slug``, ``id``, ``title``, ``html``, ``unlisted``,
          ``image``, ``markdown``, ``no_html`` and ``owner_id``.
    """
    if not entry_data:
        return web.HTTPNotFound()

    entry_id = entry_data['_id']
    title = entry_data['title']
    stored_content = entry_data.get('content', '[no content]')
    unlisted = entry_data.get('unlisted', False)
    image = entry_data.get('image')
    no_html = entry_data.get('nohtml_content')
    html = await utils.before_show_text(stored_content)
    # Markdown is always regenerated from the processed HTML.
    markdown = utils.html_to_markdown(html)
    owner_id = entry_data.get('owner_id')
    slug = utils.url_title(title)

    if preview:
        return {
            'title': title,
            'preview': utils.remove_html(html),
            'html': html,
            'id': entry_id,
            'image': image
        }

    return {
        'slug': slug,
        'id': entry_id,
        'title': title,
        'html': html,
        'unlisted': unlisted,
        'image': image,
        'markdown': markdown,
        'no_html': no_html,
        'owner_id': owner_id,
    }
コード例 #7
0
ファイル: app.py プロジェクト: rajit906/Recommender-System
def get_recos():
    """Return ranked recommendations for the submitted description as JSON.

    Reads ``user_desc`` and ``n`` from the query string (GET) or the form
    body (POST), looks up the ``n`` most similar catalog products for the
    description, and responds with a JSON list of
    ``{'rank': 1-based position, 'prod_desc': HTML-stripped text}``.
    """
    if request.method == "GET":
        # obtain user input
        params = request.args
    elif request.method == 'POST':
        params = request.form
    user_desc = params['user_desc']
    num_recs = int(params['n'])

    prepared = preproc_user_input(user_desc, model)
    embedding = compute_user_input_embedding(prepared, model)
    sim_prod = get_similar_products(embedding, catalog_embeddings, num_recs)

    # Only the first element of each similarity tuple is passed on.
    id_list = map(lambda pair: pair[0], sim_prod)
    recos = serve_recos(id_list, catalog)

    results = [
        {'rank': position, 'prod_desc': remove_html(reco)}
        for position, reco in enumerate(recos, start=1)
    ]
    return jsonify(results)
コード例 #8
0
    def csv_export(self, request, qs=None, raw_headers=None):
        """
        Write the records of ``qs`` into a CSV attachment response.

        The download is named after ``slugify(self.model.__name__)``. At most
        ``self.csv_record_limit`` records are taken from
        ``self.get_csv_queryset(request, qs)``. The header row is built while
        handling the first record, so an empty result yields a response with
        no rows at all (not even a header).

        Each entry of ``raw_headers`` may be a key of
        ``self.get_csv_header_names(request)``, a callable exposing ``name``
        and ``short_description``, a ``(key, verbose_name)`` pair, or a
        string naming an attribute of ``self``, of the record, or of
        ``self.model`` (``"__"``-delimited paths are followed by
        ``get_attr``).

        Args:
            request: forwarded to the ``get_csv_*`` hooks.
            qs: forwarded to ``self.get_csv_queryset``.
            raw_headers: column specifications; when ``None`` they come from
                ``self.get_csv_raw_headers(request)``.

        Returns:
            The ``HttpResponse`` the CSV rows were written to.

        Raises:
            Exception: when no headers are available and they cannot be taken
                from a dict record (``self.csv_headers_all`` is falsy or the
                first record is not a dict).
        """
        # NOTE(review): newer Django releases take ``content_type`` instead of
        # ``mimetype`` -- confirm the Django version this targets.
        response = HttpResponse(mimetype='text/csv')
        response['Content-Disposition'] = 'attachment; filename=%s.csv' \
            % slugify(self.model.__name__)

        if raw_headers is None:
            raw_headers = self.get_csv_raw_headers(request)

        def get_attr(obj, name, as_name=False):
            """
            Dereferences "__" delimited variable names.

            Each path segment is fetched with ``getattr`` (missing attributes
            become ``None``) and called if it is callable. Returns ``None``
            when the result compares equal to ``obj`` itself, the last path
            segment when ``as_name`` is true, and the resolved value
            otherwise.
            """
            parts = name.split('__')
            cursor = obj
            for part in parts:
                name = part
                cursor = getattr(cursor, part, None)
                if callable(cursor):
                    cursor = cursor()
            if cursor == obj:
                return
            if as_name:
                return name
            return cursor

        # Header state; filled in while processing the first record below.
        header_data = {}
        fieldnames = []
        header_names = self.get_csv_header_names(request)

        # Write records.
        first = True
        qs = self.get_csv_queryset(request, qs)
        for r in qs[:self.csv_record_limit]:

            if first:
                first = False
                if not raw_headers:
                    # Fall back to the record's own keys, which is only
                    # possible for dict records with csv_headers_all enabled.
                    if self.csv_headers_all and isinstance(r, dict):
                        if isinstance(qs, utils.DictCursor):
                            raw_headers = qs.field_order
                        else:
                            raw_headers = r.keys()
                    else:
                        raise Exception('No headers specified.')
                # Resolve a column key and a display label for every header
                # specification; the first matching branch wins.
                for name in raw_headers:
                    if name in header_names:
                        name_key = name
                        header_data[name] = header_names.get(name_key)
                    elif callable(name):
                        # This is likely a Formatter instance.
                        name_key = name.name
                        header_data[name_key] = name.short_description
                    elif isinstance(name, (tuple, list)) and len(name) == 2:
                        name_key, name_key_verbose = name
                        header_data[name_key] = name_key_verbose
                    # NOTE(review): ``basestring`` is a Python 2 builtin --
                    # confirm the interpreter this targets.
                    elif isinstance(name, basestring) and hasattr(self, name):
                        # This is likely a ModelAdmin method name.
                        name_key = name
                        header_data[name_key] = getattr(self, name).short_description
                    elif hasattr(name, 'short_description'):
                        name_key = name
                        header_data[name_key] = getattr(
                            name, 'short_description')
                    elif hasattr(self.model, name):
                        name_key = name
                        if hasattr(getattr(self.model, name), 'short_description'):
                            header_data[name_key] = getattr(getattr(self.model, name), 'short_description')
                        else:
                            header_data[name_key] = name
                    else:
                        # Unknown name: use it verbatim as its own label.
                        name_key = name
                        header_data[name_key] = name_key#get_attr(r, name, as_name=True)
#                        field = self.model._meta.get_field_by_name(name)
#                        if field and field[0].verbose_name:
#                            header_data[name_key] = field[0].verbose_name
#                        else:
#                            header_data[name_key] = name
                    # Every label is title-cased regardless of its origin.
                    header_data[name_key] = header_data[name_key].title()
                    fieldnames.append(name_key)

                # The header row is a dict mapping each column key to its
                # label, written through the same DictWriter as the data.
                writer = csv.DictWriter(
                    response,
                    fieldnames=fieldnames,
                    quoting=self.csv_quoting)
                writer.writerow(header_data)
            #print('fieldnames:',fieldnames
            data = {}
            for name in raw_headers:
                obj = None
                if isinstance(r, dict):
                    if name in r:
                        # Dict records: the stored value is copied as-is; the
                        # ``continue`` skips to_ascii and HTML removal.
                        data[name] = r[name]
    #                    print('skipping:',name
                        continue
#                    elif 'id' in r:
#                        obj = self.model.objects.get(id=r['id'])

#                print('model:',self.model
#                print('r:',r
#                print('name:',name,isinstance(name, basestring) and hasattr(r, name)
                if callable(name):
                    # This is likely a Formatter instance.
                    name_key = name.name
                    if hasattr(name, 'plaintext'):
                        data[name_key] = to_ascii(name(r, plaintext=True))
                    else:
                        data[name_key] = to_ascii(name(r))
                elif isinstance(name, (tuple, list)) and len(name) == 2:
                    name_key, name_key_verbose = name
                    if hasattr(self, name_key):
                        # NOTE(review): unlike the ModelAdmin-method branch
                        # below, this attribute is not called with ``r``; if
                        # the value is still callable after to_ascii it is
                        # invoked with no arguments further down -- confirm
                        # this is intended.
                        data[name_key] = to_ascii(getattr(self, name_key))
                    else:
                        data[name_key] = to_ascii(getattr(r, name_key))
                elif isinstance(name, basestring) and hasattr(self, name):
                    # This is likely a ModelAdmin method name.
                    name_key = name
                    data[name_key] = to_ascii(getattr(self, name)(r))
                elif isinstance(name, basestring) and hasattr(r, name):
                    name_key = name
                    data[name_key] = to_ascii(getattr(r, name))
                else:
                    # Last resort: follow a "__"-delimited path on the record.
                    name_key = name
                    data[name_key] = to_ascii(get_attr(r, name))

                # A value that is still callable is called with no arguments.
                if callable(data[name_key]):
                    data[name_key] = to_ascii(data[name_key]())

                if self.csv_remove_html:
                    data[name_key] = utils.remove_html(data[name_key])

            #print('data:',data
            writer.writerow(data)
        return response
コード例 #9
0
    def score(self,
              text,
              tests=["TOXICITY"],
              context=None,
              languages=None,
              do_not_store=False,
              token=None,
              text_type=None):
        """Send ``text`` to the ``comments:analyze`` endpoint and wrap the scores.

        Args:
            text: the comment to analyse. Only the first 3000 characters are
                sent; a warning is issued when the text is longer.
            tests: attribute name, list of attribute names, or dict mapping
                attribute names to option dicts. Allowed option keys are
                ``scoreType`` and ``scoreThreshold``; the legacy misspelling
                ``scoreThreshhold`` is still accepted and is sent to the API
                as ``scoreThreshold``. The default list is never mutated.
            context: accepted for interface compatibility; not used here.
            languages: optional iterable of language codes; each is
                lower-cased and kept only if ``validate_language`` accepts it.
            do_not_store: when truthy, sent as ``doNotStore``.
            token: passed through to the returned ``Comment``.
            text_type: ``"html"`` or ``"md"`` (case-insensitive) to strip
                markup with ``remove_html`` before scoring.

        Returns:
            Comment: holds one ``Attribute`` per requested test, each with its
            summary score and any per-span scores from the response.

        Raises:
            ValueError: ``tests`` is not a str/list/dict, ``text_type`` is
                not recognised, or a test has an unknown option key.
            PerspectiveAPIException: the API response contains ``error``.
        """
        # data validation
        # TODO: see if an endpoint that has valid types exists
        if isinstance(tests, str):
            tests = [tests]
        if not isinstance(tests, (list, dict)):
            raise ValueError("Invalid list/dictionary provided for tests")
        if isinstance(tests, list):
            tests = {test: {} for test in tests}
        if text_type:
            markup = text_type.lower()
            if markup == "html":
                text = remove_html(text)
            elif markup == "md":
                text = remove_html(text, md=True)
            else:
                raise ValueError(
                    "{0} is not a valid text_type. Valid options are 'html' or 'md'"
                    .format(str(text_type)))

        # "scoreThreshold" is the key the API documents; the misspelled
        # variant stays accepted so existing callers keep working.
        valid_options = ("scoreType", "scoreThreshold", "scoreThreshhold")
        for test, options in tests.items():
            if test not in allowed:
                warnings.warn(
                    "{0} might not be accepted as a valid test.".format(
                        str(test)))
            for key in options:
                if key not in valid_options:
                    raise ValueError(
                        "{0} is not a valid sub-property for {1}".format(
                            key, test))

        # The API will only grade text less than 3k characters long
        if len(text) > 3000:
            # TODO: allow disassembly/reassembly of >3000char comments
            warnings.warn(
                "Perspective only allows 3000 character strings. Only the first 3000 characters will be sent for processing"
            )
            text = text[:3000]
        new_langs = []
        if languages:
            for language in languages:
                language = language.lower()
                if validate_language(language):
                    new_langs.append(language)

        # packaging data
        requested = {}
        for test, options in tests.items():
            # Copy so the caller's dict is untouched while the legacy key is
            # renamed to the spelling the API expects.
            options = dict(options)
            if "scoreThreshhold" in options:
                legacy_value = options.pop("scoreThreshhold")
                options.setdefault("scoreThreshold", legacy_value)
            requested[test] = options

        url = Perspective.base_url + "/comments:analyze"
        querystring = {"key": self.key}
        payload_data = {
            "comment": {"text": text},
            "requestedAttributes": requested,
        }
        # Only send languages that survived validation; the previous
        # ``!= None`` check was always true and sent an empty list.
        if new_langs:
            payload_data["languages"] = new_langs
        if do_not_store:
            payload_data["doNotStore"] = do_not_store
        payload = json.dumps(payload_data)
        headers = {'content-type': "application/json"}
        response = requests.post(url,
                                 data=payload,
                                 headers=headers,
                                 params=querystring)
        data = response.json()
        if "error" in data:
            raise PerspectiveAPIException(data["error"]["message"])

        c = Comment(text, [], token)
        base = data["attributeScores"]
        for test in tests:
            summary = base[test]["summaryScore"]
            a = Attribute(test, [], summary["value"], summary["type"])
            # Tolerate responses that carry no per-span scores.
            for span in base[test].get("spanScores", []):
                s = Span(span["begin"], span["end"], span["score"]["value"],
                         span["score"]["type"], c)
                a.spans.append(s)
            c.attributes.append(a)
        return c
コード例 #10
0
    def csv_export(self, request, qs=None, raw_headers=None):
        """
        Write the records of ``qs`` into a CSV attachment response.

        The download is named after ``slugify(self.model.__name__)``. At most
        ``self.csv_record_limit`` records are taken from
        ``self.get_csv_queryset(request, qs)``. The header row is built while
        handling the first record, so an empty result yields a response with
        no rows at all (not even a header).

        Each entry of ``raw_headers`` may be a key of
        ``self.get_csv_header_names(request)``, a callable exposing ``name``
        and ``short_description``, a ``(key, verbose_name)`` pair, or a
        string naming an attribute of ``self``, of the record, or of
        ``self.model`` (``"__"``-delimited paths are followed by
        ``get_attr``).

        Args:
            request: forwarded to the ``get_csv_*`` hooks.
            qs: forwarded to ``self.get_csv_queryset``.
            raw_headers: column specifications; when ``None`` they come from
                ``self.get_csv_raw_headers(request)``.

        Returns:
            The ``HttpResponse`` the CSV rows were written to.

        Raises:
            Exception: when no headers are available and they cannot be taken
                from a dict record (``self.csv_headers_all`` is falsy or the
                first record is not a dict).
        """
        # NOTE(review): newer Django releases take ``content_type`` instead of
        # ``mimetype`` -- confirm the Django version this targets.
        response = HttpResponse(mimetype='text/csv')
        response['Content-Disposition'] = 'attachment; filename=%s.csv' \
            % slugify(self.model.__name__)

        if raw_headers is None:
            raw_headers = self.get_csv_raw_headers(request)

        def get_attr(obj, name, as_name=False):
            """
            Dereferences "__" delimited variable names.

            Each path segment is fetched with ``getattr`` (missing attributes
            become ``None``) and called if it is callable. Returns ``None``
            when the result compares equal to ``obj`` itself, the last path
            segment when ``as_name`` is true, and the resolved value
            otherwise.
            """
            parts = name.split('__')
            cursor = obj
            for part in parts:
                name = part
                cursor = getattr(cursor, part, None)
                if callable(cursor):
                    cursor = cursor()
            if cursor == obj:
                return
            if as_name:
                return name
            return cursor

        # Header state; filled in while processing the first record below.
        header_data = {}
        fieldnames = []
        header_names = self.get_csv_header_names(request)

        # Write records.
        first = True
        qs = self.get_csv_queryset(request, qs)
        for r in qs[:self.csv_record_limit]:

            if first:
                first = False
                if not raw_headers:
                    # Fall back to the record's own keys, which is only
                    # possible for dict records with csv_headers_all enabled.
                    if self.csv_headers_all and isinstance(r, dict):
                        if isinstance(qs, utils.DictCursor):
                            raw_headers = qs.field_order
                        else:
                            raw_headers = r.keys()
                    else:
                        raise Exception('No headers specified.')
                # Resolve a column key and a display label for every header
                # specification; the first matching branch wins.
                for name in raw_headers:
                    if name in header_names:
                        name_key = name
                        header_data[name] = header_names.get(name_key)
                    elif callable(name):
                        # This is likely a Formatter instance.
                        name_key = name.name
                        header_data[name_key] = name.short_description
                    elif isinstance(name, (tuple, list)) and len(name) == 2:
                        name_key, name_key_verbose = name
                        header_data[name_key] = name_key_verbose
                    # NOTE(review): ``basestring`` is a Python 2 builtin --
                    # confirm the interpreter this targets.
                    elif isinstance(name, basestring) and hasattr(self, name):
                        # This is likely a ModelAdmin method name.
                        name_key = name
                        header_data[name_key] = getattr(self,
                                                        name).short_description
                    elif hasattr(name, 'short_description'):
                        name_key = name
                        header_data[name_key] = getattr(
                            name, 'short_description')
                    elif hasattr(self.model, name):
                        name_key = name
                        if hasattr(getattr(self.model, name),
                                   'short_description'):
                            header_data[name_key] = getattr(
                                getattr(self.model, name), 'short_description')
                        else:
                            header_data[name_key] = name
                    else:
                        # Unknown name: use it verbatim as its own label.
                        name_key = name
                        header_data[
                            name_key] = name_key  #get_attr(r, name, as_name=True)
#                        field = self.model._meta.get_field_by_name(name)
#                        if field and field[0].verbose_name:
#                            header_data[name_key] = field[0].verbose_name
#                        else:
#                            header_data[name_key] = name
                    # Every label is title-cased regardless of its origin.
                    header_data[name_key] = header_data[name_key].title()
                    fieldnames.append(name_key)

                # The header row is a dict mapping each column key to its
                # label, written through the same DictWriter as the data.
                writer = csv.DictWriter(response,
                                        fieldnames=fieldnames,
                                        quoting=self.csv_quoting)
                writer.writerow(header_data)
            #print('fieldnames:',fieldnames
            data = {}
            for name in raw_headers:
                obj = None
                if isinstance(r, dict):
                    if name in r:
                        # Dict records: the stored value is copied as-is; the
                        # ``continue`` skips to_ascii and HTML removal.
                        data[name] = r[name]
                        #                    print('skipping:',name
                        continue


#                    elif 'id' in r:
#                        obj = self.model.objects.get(id=r['id'])

#                print('model:',self.model
#                print('r:',r
#                print('name:',name,isinstance(name, basestring) and hasattr(r, name)
                if callable(name):
                    # This is likely a Formatter instance.
                    name_key = name.name
                    if hasattr(name, 'plaintext'):
                        data[name_key] = to_ascii(name(r, plaintext=True))
                    else:
                        data[name_key] = to_ascii(name(r))
                elif isinstance(name, (tuple, list)) and len(name) == 2:
                    name_key, name_key_verbose = name
                    if hasattr(self, name_key):
                        # NOTE(review): unlike the ModelAdmin-method branch
                        # below, this attribute is not called with ``r``; if
                        # the value is still callable after to_ascii it is
                        # invoked with no arguments further down -- confirm
                        # this is intended.
                        data[name_key] = to_ascii(getattr(self, name_key))
                    else:
                        data[name_key] = to_ascii(getattr(r, name_key))
                elif isinstance(name, basestring) and hasattr(self, name):
                    # This is likely a ModelAdmin method name.
                    name_key = name
                    data[name_key] = to_ascii(getattr(self, name)(r))
                elif isinstance(name, basestring) and hasattr(r, name):
                    name_key = name
                    data[name_key] = to_ascii(getattr(r, name))
                else:
                    # Last resort: follow a "__"-delimited path on the record.
                    name_key = name
                    data[name_key] = to_ascii(get_attr(r, name))

                # A value that is still callable is called with no arguments.
                if callable(data[name_key]):
                    data[name_key] = to_ascii(data[name_key]())

                if self.csv_remove_html:
                    data[name_key] = utils.remove_html(data[name_key])

            #print('data:',data
            writer.writerow(data)
        return response