Example 1
 def _get_doc_info(self, obj):
     # Collect the general metadata, then attach the description and the
     # tag-stripped body text.
     d = self._get_general_info(obj)
     fields = {}
     fields['description'] = obj.description
     fields['text'] = dehtml(obj.getText())
     d['fields'] = fields
     return d
Example 2
def raw_message_to_obj(response):
    obj = collections.OrderedDict()

    fields = ['Subject', 'Date', 'From', 'To', 'Cc', 'Message-ID']

    try:
        # Pull the Subject header first, then the snippet, so the ordered
        # dict lists subject before snippet.
        for f in fields[:1]:
            v = [x['value'] for x in response['payload']['headers'] if x['name'] == f]
            obj[f] = ''.join(v)  # an empty list joins to an empty string
        obj['snippet'] = dehtml.dehtml(response['snippet'])
        # Now fill in the remaining headers in order.
        for f in fields[1:]:
            v = [x['value'] for x in response['payload']['headers'] if x['name'] == f]
            obj[f] = ''.join(v)  # an empty list joins to an empty string

        obj['message'] = parse_multipart_message(response)

    except Exception as error:
        print('An error occurred: %s' % error)
    return obj
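For context, the `response` argument is a full Gmail API message resource. A minimal sketch of how one might be fetched and fed in, assuming an authorized `service` client and a hypothetical `message_id` from a prior list call:

    # Assumes an authorized googleapiclient Gmail service; message_id is
    # hypothetical, e.g. taken from service.users().messages().list().
    response = service.users().messages().get(
        userId='me', id=message_id, format='full').execute()
    obj = raw_message_to_obj(response)
    print(obj['Subject'], obj['snippet'])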
Example 3
 def get(self):
     format = self.request.get("format")
     hook = db.GqlQuery(self.__queryString())
     # Retry until the query returns data (TODO: find a better way than
     # busy-looping).
     while hook.count() == 0:
         hook = db.GqlQuery(self.__queryString())
     soup = BeautifulSoup.BeautifulSoup(hook[0].text.replace(";", ","))
     ash = soup.findAll("a")
     ashlinks = []
     for a in ash:
         metadict = {'metaurl': "http://en.wikipedia.org" + a["href"],
                     'metatext': a.text}
         ashlinks.append(metadict)
     tex = dehtml.dehtml(hook[0].text)
     tex = tex.replace("... that ", "", 1).replace(';', ',').rstrip("?")
     texlt = tex.split(" ", 1)
     tex = texlt[0].capitalize() + " " + texlt[1]
     responseData = {"response": [{"hook": {
         "title": unquote(hook[0].link.replace(";", ",")),
         "text": tex,
         "metadata": ashlinks}}]}
     if format == "json":
         self.ReturnJSON(responseData)
     elif format == "xml":
         self.ReturnXML(responseData)
     else:
         self.response.out.write("Incompatible format or no format specified!")
Example 4
 def fill_content(self):
     assert self.kind
     plain_texts = []
     word_texts = []
     html_texts = []
     has_attachment = False
     for part in self.envelope_object.walk():
         if part.get_content_type() == 'message/delivery-status':
             self.save()
             raise Exception('Bounce email: {}'.format(self))
         elif not part.is_multipart():
             if part.get_content_type() == 'text/plain':
                 charset = part.get_content_charset()
                 payload = part.get_payload(decode=True)
                 if payload:  # decode=True returns bytes; b'' != '' is always true
                     plain_texts.append(payload.decode(charset))
             elif part.get_content_type() == 'text/html':
                 charset = part.get_content_charset()
                 payload = part.get_payload(decode=True)
                 if payload:
                     html_texts.append(dehtml(payload.decode(charset)))
             elif self.kind == 'response':
                 # Only accept attachments from representatives.
                 if part.get_content_type() == 'application/msword':
                     word_texts.append(
                         antiword.antiword_string(
                             part.get_payload(decode=True)).replace(
                                 '[pic]', '').replace('|', ''))
                 if part.get_content_type() in ATTACHMENT_MIMETYPES:
                     has_attachment = True
                     attachment = Attachment(
                         mimetype=part.get_content_type(),
                         original_filename=part.get_filename(),
                         message=self,
                     )
                     attachment.set_content(part.get_payload(decode=True))
                     attachment.save()
                     self.parent.is_locked = True
                     self.parent.save()
             else:
                 logger.warning(
                     u'Skipping attachment {} ({}) in {} {}'.format(
                         part.get_filename(),
                         part.get_content_type(),
                         self.kind,
                         self.id,
                     ))
     if not (plain_texts or html_texts or word_texts or has_attachment):
         raise Exception("Couldn't extract any content")
     body_text = '\n\n***\n\n'.join((plain_texts or html_texts) +
                                    word_texts)
     self.body_text = utils.remove_consequentive_empty_lines(
         utils.remove_reply_email(body_text))
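A minimal sketch of driving this method, assuming `envelope_object` holds a parsed `email.message.Message` and that the surrounding `Message` model takes a `kind` argument (both assumptions, based only on what the method reads):

    import email

    # Hypothetical model instance; fill_content() walks envelope_object.
    msg = Message(kind='response')
    with open('mail.eml', 'rb') as fh:
        msg.envelope_object = email.message_from_bytes(fh.read())
    msg.fill_content()
    print(msg.body_text)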
Example 5
 def setTitle(self, fulltext=None):
     # Read the file contents if no text was passed in.
     if not fulltext:
         try:
             with open(self.path, 'r') as f:
                 fulltext = f.read()
         except Exception as e:
             print(str(e))
     # Strip the markup first if the text looks like an HTML document.
     html = False
     try:
         html = (fulltext.split('\n', 1)[0].split()[0] == '<!DOCTYPE')
     except Exception:
         pass
     if html:
         fulltext = dehtml.dehtml(fulltext)
     self.title = fulltext.strip().split('\n', 1)[0][:MAX_TITLE_LEN]
     if not self.title:
         self.title = "Untitled"
     self.setText(self.title)
     return self.title
Example 6
 def process_event(self, event):
     self.logger.debug("Received event: %r", event)
     action = event['action']
     if action == Chatter.Actions.GET_READY:
         self.cid = event['cid']
         self.send_data(action=Chatter.Actions.SET_READY)
     elif action == Chatter.Actions.START_CHAT:
         self.connected = True
         self.send_unsent_messages()
         self.on_start_chat()
     elif action == Chatter.Actions.STOP_CHAT:
         self.disconnected = True
         self.on_stop_chat()
     elif action in (Chatter.Actions.START_TYPING, Chatter.Actions.STOP_TYPING):
         self.on_typing(started=(action == Chatter.Actions.START_TYPING))
     elif action == Chatter.Actions.NEW_MESSAGE:
         if event['user'] != Chatter.USER_ME:
             self.on_message(dehtml.dehtml(event['message']))
     elif action == Chatter.Actions.PING:
         self.on_ping()
     else:
         self.logger.error("Unknown event action: %r", event)
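For illustration, the kind of event dict this dispatcher expects, with made-up field values (only the keys are taken from the code above):

    # Hypothetical incoming event from another user; process_event strips
    # the HTML before handing the text to on_message().
    event = {'action': Chatter.Actions.NEW_MESSAGE,
             'user': 'partner',
             'message': '<b>hello</b> there'}
    chatter.process_event(event)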
Example 7
def get_all_threads(service, querystring):
    all_threads = []
    try:
        thread_count = 0
        start = True
        # Page through the thread list until no nextPageToken is returned.
        while start or 'nextPageToken' in response:
            if start:
                page_token = None
                start = False
            else:
                page_token = response['nextPageToken']

            response = service.users().threads().list(userId='me', pageToken=page_token, q=querystring).execute()
            if 'threads' in response:
                thread_count += len(response['threads'])
                print("  == Loading ", thread_count, "threads")
                for thread in response['threads']:
                    thread['snippet'] = dehtml.dehtml(thread['snippet'])
                    print(thread)
                    all_threads.append(thread)
    except errors.HttpError as error:
        print('An HTTPError occurred: %s' % error)
    return all_threads
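A short sketch of how this might be driven, assuming valid Gmail API credentials (`creds`) obtained elsewhere:

    # Assumes google-api-python-client is installed; creds come from an
    # OAuth flow (e.g. google-auth-oauthlib) outside this snippet.
    from googleapiclient.discovery import build
    from googleapiclient import errors  # referenced by get_all_threads

    service = build('gmail', 'v1', credentials=creds)
    threads = get_all_threads(service, 'from:someone@example.com')
    print(len(threads), 'threads matched')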
Example 8
def tfidfScores(faceUrls, noFaceUrls, faceSrcs, noFaceSrcs, sourceContent):
    print "srcs: " + str(noFaceSrcs)
    print "getting tfidf scores"
    sortedSrcs = []
    if faceUrls is not None and faceSrcs is not None:
        faceDocuments = []
        for i in range(len(faceUrls)):
            try:
                # get html from search result url
                response = requests.get(faceUrls[i])

                #don't consider url if we don't get a good response
                if response.status_code == 200 and response.text != '':
                    #try to decode
                    encoding = response.encoding
                    text = response.content.decode(encoding)

                    #clean the javascript and css from the html file
                    cleaner = Cleaner()
                    cleaner.javascript = True
                    cleaner.style = True

                    cleaned = cleaner.clean_html(text)

                    #try to extract only the text from the remaining html
                    try:
                        parsed = (dehtml.dehtml(cleaned))
                    except UnicodeDecodeError:
                        print "UnicodeDecodeError"
                        # discard this url
                        continue

                    #lowercase it and remove punctuation
                    lowers = parsed.lower()
                    if type(lowers) is unicode:
                        ascii = unicodedata.normalize("NFKD", lowers).encode(
                            "ascii", "ignore")
                        noPunct = ascii.translate(None, string.punctuation)
                    elif type(lowers) is str:
                        noPunct = lowers.translate(None, string.punctuation)
                    faceDocuments.append(noPunct)
            except:
                pass

        #lowercase the source content (which has already had punctuation removed)
        lowers = sourceContent.lower()
        faceDocuments.insert(0, lowers)

        #tfidf on the documents (search results along with source content)
        tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words="english")
        faceTfs = tfidf.fit_transform(faceDocuments)

        #cosine similarity
        faceSimilarity = faceTfs * faceTfs.T

        #convert 0th row into an array (document similarities to the source content)
        similarities = []
        print("building similarities")
        print("face similarity length: %d" % faceSimilarity.get_shape()[1])
        for i in range(faceSimilarity.get_shape()[1]):
            similarities.append(faceSimilarity[0, i])

        #sort the sources by decreasing cosine similarity
        indices = [
            i[0] for i in sorted(
                enumerate(similarities), key=lambda x: x[1], reverse=True)
        ]

        for i in range(len(indices)):
            if len(faceSrcs) > indices[i]:
                sortedSrcs.append(faceSrcs[indices[i]])

    #same as above, but for the no-face-detection urls; guard against None
    #for symmetry with the face branch above
    noFaceDocuments = []
    for i in range(len(noFaceUrls or [])):
        try:
            response = requests.get(noFaceUrls[i])

            if response.status_code == 200 and response.text != '':
                encoding = response.encoding
                text = response.content.decode(encoding)

                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True

                cleaned = cleaner.clean_html(text)  # use the decoded text, as in the face loop above

                try:
                    parsed = (dehtml.dehtml(cleaned))
                except UnicodeDecodeError:
                    print "UnicodeDecodeError"
                    continue

                lowers = parsed.lower()
                if type(lowers) is unicode:
                    ascii = unicodedata.normalize("NFKD", lowers).encode(
                        "ascii", "ignore")
                    noPunct = ascii.translate(None, string.punctuation)
                elif type(lowers) is str:
                    noPunct = lowers.translate(None, string.punctuation)
                noFaceDocuments.append(noPunct)
        except:
            pass

    lowers = sourceContent.lower()
    noFaceDocuments.insert(0, lowers)

    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words="english")
    noFaceTfs = tfidf.fit_transform(noFaceDocuments)

    noFaceSimilarity = noFaceTfs * noFaceTfs.T

    similarities = []
    print("building similarities")
    print("face similarity length: %d" % noFaceSimilarity.get_shape()[1])
    for i in range(noFaceSimilarity.get_shape()[1]):
        similarities.append(noFaceSimilarity[0, i])

    indices = [
        i[0] for i in sorted(
            enumerate(similarities), key=lambda x: x[1], reverse=True)
    ]

    for i in range(len(indices)):
        if len(noFaceSrcs) > indices[i]:
            sortedSrcs.append(noFaceSrcs[indices[i]])

    #end tfidf
    print "done doing tfidf"
    print "sortedSrcs: " + str(sortedSrcs)

    #return image sources sorted by url text cosine similarity to source content
    return sortedSrcs
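A hedged sketch of a call, with made-up inputs, to show the expected shapes (parallel lists of result urls and image srcs, plus the preprocessed source text):

    # Hypothetical inputs; sourceContent should already be lowercased and
    # punctuation-free per the comments in the function.
    faceUrls = ['http://example.com/a']
    faceSrcs = ['http://example.com/a.jpg']
    noFaceUrls = ['http://example.com/b']
    noFaceSrcs = ['http://example.com/b.jpg']
    ranked = tfidfScores(faceUrls, noFaceUrls, faceSrcs, noFaceSrcs,
                         'source article text with punctuation removed')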
Example 9
def eliminarTagsHTML(pDocumento):
    # "Remove HTML tags": a thin wrapper around dehtml.
    return dehtml.dehtml(pDocumento)
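A minimal usage sketch, assuming dehtml exposes a module-level dehtml() that maps an HTML string to plain text (as every example on this page uses it):

    import dehtml

    html = '<p>Hello, <b>world</b>!</p>'
    print(eliminarTagsHTML(html))  # expected output along the lines of: Hello, world!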
Example 10
# Assumed setup for this snippet (the original listing starts mid-script):
import os
import re

from dehtml import dehtml
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter

words = ()
input = raw_input("Word, score? ; to quit\n")  # assumed initial prompt

while input != ';':
    i_list = input.split(', ')
    words += ([i_list[0], i_list[1]],)
    input = raw_input("Another word? ; to quit\n")

scores = {w: float(n) for w, n in words}
total = 0

output = open("results.txt", "w")

results = {}

# `root` (the directory to scan) is defined earlier in the original script.
for dirs, subdirs, files in os.walk(root):
    for f in files:
        if f.endswith('.rtf'):
            # os.walk yields bare filenames, so join with the directory path.
            doc = Rtf15Reader.read(open(os.path.join(dirs, f), "rb"))
            total = 0
            # Convert the RTF to XHTML, strip the markup, then split into words.
            text = dehtml(XHTMLWriter.write(doc).read()).lower().split()
            for word in text:
                word = re.sub(r'\W+', '', word)
                total += scores.get(word, 0)

            results[f] = total

for key, value in sorted(results.items()):
    output.write(key + "       " + str(value) + "\n")

print "Finished! Check results.txt"
raw_input("\nPress enter to close.")