def get_questions_detail(self): #Only collect detailed content for questions meeting certain thresholds (e.g. follower/answer counts above some value, which should not be large) #If an answer is already stored in MongoDB, author-side updates to it are ignored, because answers are usually long and re-fetching is I/O-expensive
    """Fetch, clean and persist the full content of every answer listed in
    the topics collection.

    Reads each topic document's 'aids' list (answer ids), downloads each
    answer via the Zhihu client, strips HTML, concatenates its comments,
    and inserts one document per answer into ANSWERS_COLLECTION.
    NOTE(review): assumes self.client_ is a zhihu-api client and self.db_
    a pymongo database — confirm against the enclosing class.
    """
    all_answers_id = pd.DataFrame(list(self.db_[TOPICS_COLLECTION].find()))
    for answers in all_answers_id['aids']: #extract every answer id belonging to each question
        for ans_id in answers:
            ans = self.client_.answer(ans_id)
            # Strip HTML tags from the raw answer body.
            ans_content = Cleaner.filter_tags(ans.content)
            comment_content = ""
            if (ans.comment_count > 0):
                for comment in ans.comments:
                    comment_content += Cleaner.filter_tags(
                        comment.content) + "@" #separated by '@'; later we can either score sentiment on the whole blob or per comment
            # Random pause so we don't hammer the remote API.
            time.sleep(random.randint(1, 4))
            print("ans {} done retrival and cleaning".format(ans.id))
            ans_detail = {
                'aid': ans_id,
                'votes': ans.voteup_count,
                'content': ans_content,
                'comments': comment_content,
                'author_follower_num': ans.author.follower_count
            }
            self.db_[ANSWERS_COLLECTION].insert_one(ans_detail)
def CountModel(StatusDataRdd):
    """Classify a single tweet RDD element against the life-events list and
    append the result to a parquet file.

    Takes an RDD whose first element is a tweet status dict (Twitter API
    JSON shape: 'text', 'created_at', 'id'), cleans the text, matches the
    remaining words against each configured life event, and writes one row
    per detected event (or a single "None" row) to the output parquet path.

    Fix: the trailing `HbaseSave.SaveRecord(...)` call was live code but
    referenced `HBaseInsert`, which is only assigned inside the
    commented-out HBase section below — calling it raised NameError.  It is
    now commented out with the rest of that section.
    """
    # Create or get Spark and SQL contexts, and broadcast the stop words,
    # apostrophe replacements and life-events list to the workers.
    SparkContext = ContextProvider.getSparkInstance()
    SqlContext = ContextProvider.getSQLContext()
    StopWordsList = SparkContext.broadcast(FileContentLoader.LoadStopWords())
    ApostrophesReplaceList = SparkContext.broadcast(
        FileContentLoader.LoadApostrophesReplaceWords())
    LifeEventsList = SparkContext.broadcast(FileContentLoader.LifeEventsList())

    # Clean the tweet text down to a bare list of significant words.
    StatusData = StatusDataRdd.collect()[0]
    ClanedTweetText = Cleaner.ReplaceApostrophes(
        ApostrophesReplaceList.value,
        Cleaner.RemovePunctuations(StatusData["text"]))
    BareTweetWords = Cleaner.RemoveStopWords(StopWordsList.value,
                                             ClanedTweetText).split()
    DetectedTopic = []

    # Parse the tweet date; slicing [:19] + [25:] drops the "+0000" offset
    # field from Twitter's created_at format so strptime can handle it.
    TweetDate = datetime.strptime(
        StatusData["created_at"][:19] + StatusData["created_at"][25:],
        '%a %b %d %X %Y')

    # Identify if the tweet belongs to a particular life event (match on
    # either the event name or its stemmed form).
    for Event in LifeEventsList.value["LifeEventList"]:
        if Event["Event"] in BareTweetWords or Event[
                "StemWords"] in BareTweetWords:
            DetectedTopic.append({
                "Event": Event["Event"],
                "date": TweetDate.strftime('%m/%d/%Y'),
                "Id": StatusData["id"]
            })

    # Record "None" if the tweet does not belong to any life event.
    if DetectedTopic == []:
        DetectedTopic.append({
            "Event": "None",
            "date": TweetDate.strftime('%m/%d/%Y'),
            "Id": StatusData["id"]
        })

    # Save output in a parquet file.
    TweetDataFrame = SqlContext.createDataFrame(
        SparkContext.parallelize(DetectedTopic))
    TweetDataFrame.coalesce(
        ConfigProvider.MaxPartFiles).write.mode('append').parquet(
            ConfigProvider.OutputParquetFilePath)

    # for saving to HBase
    # HBaseInsert = []
    # for Topic in DetectedTopic:
    #     HBaseInsert += [(Topic["Id"], [Topic["Id"], "cf", "Topic", Topic["topic"]])]
    #     HBaseInsert += [(Topic["Id"], [Topic["Id"], "cf", "Date", Topic["date"]])]
    # print HBaseInsert
    # HbaseSave.SaveRecord(SparkContext.parallelize(HBaseInsert), "LifeEventCount")
def __init__(self, file_name, verbose=False):
    """Load and parse a WebNLG-style XML benchmark file into self.data.

    Args:
        file_name: path to the XML file; it is first cleaned in place by
            the project's Cleaner, then parsed with xmltodict.
        verbose: if truthy, report the dirty-data and coreference counters
            at the end via show_var.

    Each benchmark entry yields (triples, text, template, ner2ent) tuples
    from extract_sentences, stored as dicts in self.data.
    NOTE(review): assumes the enclosing class also defines
    _triples_from_obj and extract_sentences — confirm.
    """
    self.cleaner = Cleaner()
    self.cleaner.clean(file_name)
    self.nlp = NLP()
    self.data = []          # accumulated per-sentence records
    self.file_name = file_name
    self.cnt_dirty_data = 0  # entries discarded as malformed
    self.cnt_corefs = 0      # sentences skipped due to coreference issues
    with open(file_name, encoding="utf-8") as f:
        content = f.read()
    try:
        structure = xmltodict.parse(content)
    except:
        # NOTE(review): debug scaffolding — a bare except that drops into
        # pdb; after the debugger exits, `structure` is undefined and the
        # loop below will raise NameError.  Consider re-raising instead.
        show_var(["file_name"])
        import pdb
        pdb.set_trace()
    for entry_ix, entry in enumerate(
            self._triples_from_obj(structure["benchmark"]["entries"],
                                   "entry")):
        self.entry_ix = entry["@eid"]
        # Modified triples: "subj | pred | obj" strings split and stripped.
        triplets = [
            tuple(map(str.strip, r.split("|")))
            for r in self._triples_from_obj(
                entry["modifiedtripleset"], "mtriple")
        ]
        # Entity map: "tag | entity" pairs.
        entitymaps = dict([
            tuple(map(str.strip, entitymap.split("|")))
            for entitymap in self._triples_from_obj(
                entry["entitymap"], "entity")
        ])
        sentences = list(self.extract_sentences(entry["lex"]))
        for s_tripleset, text, template, ner2ent in sentences:
            self.data.append({
                # 'rdfs': triplets,
                "triples": s_tripleset,
                "target": template,
                "target_txt": text,
                "ner2ent": ner2ent,
            })
    if verbose and self.cnt_dirty_data:
        show_var(["self.cnt_dirty_data"])
    if verbose and self.cnt_corefs:
        show_var(["self.cnt_corefs"])
def processTweets(RawTweet):
    # Clean one raw tweet dict (Twitter API JSON).  Strips symbols,
    # hashtags, URLs and media links from the text, rewrites user
    # mentions, then applies the project's general text cleaner.
    # Returns the mutated tweet dict, or None (implicitly) on failure.
    # NOTE(review): this is Python 2 code (print statement, str/bytes mix).
    try:
        # Drop all non-ASCII characters from the text.
        RawTweet["text"] = RawTweet["text"].encode("ascii","ignore")
        # Remove Symbols from tweet text.  Each entity span is replaced by
        # an equal-length run of spaces so that the index pairs of later
        # entities (which refer to the original text) remain valid.
        for symbol in RawTweet["entities"]["symbols"]:
            StartIndex = int(symbol["indices"][0])
            EndIndex = int(symbol["indices"][1])
            ReplaceLen = EndIndex - StartIndex
            RawTweet["text"] = RawTweet["text"][ : StartIndex ] + ReplaceLen * " " + RawTweet["text"][EndIndex : ]
        # Remove Hashtags from tweet text (same space-padding scheme).
        for hashtag in RawTweet["entities"]["hashtags"]:
            StartIndex = int(hashtag["indices"][0])
            EndIndex = int(hashtag["indices"][1])
            ReplaceLen = EndIndex - StartIndex
            RawTweet["text"] = RawTweet["text"][ : StartIndex ] + ReplaceLen * " " + RawTweet["text"][EndIndex : ]
        # Remove Url's from tweet text (same space-padding scheme).
        for url in RawTweet["entities"]["urls"]:
            StartIndex = int(url["indices"][0])
            EndIndex = int(url["indices"][1])
            ReplaceLen = EndIndex - StartIndex
            RawTweet["text"] = RawTweet["text"][ : StartIndex ] + ReplaceLen * " " + RawTweet["text"][EndIndex : ]
        # Remove media content like image links from tweet text.
        if "media" in RawTweet["entities"] and RawTweet["entities"]["media"] is not None:
            for MediaUrl in RawTweet["entities"]["media"]:
                RawTweet["text"] = RawTweet["text"].replace(MediaUrl["url"],"")
        # Replace "@screen_name:" mentions with the user's display name.
        for user in RawTweet["entities"]["user_mentions"]:
            RawTweet["text"] = RawTweet["text"].replace("@"+user["screen_name"] + ":" , " " + user["name"].encode("ascii","ignore") + " ")
        # Perform a general text cleaning, removing unwanted characters.
        RawTweet["text"] = Cleaner.TextCleaner(RawTweet["text"])
        return RawTweet
    except:
        # NOTE(review): bare except silently converts any failure into a
        # None return; callers must tolerate that.
        print "Error cleaning tweets from json file\n " , sys.exc_info()[0]
def latexerapi():
    """HTTP endpoint: OCR the image named by the `image` query parameter
    and convert the recognised text to LaTeX.

    Returns a JSON string containing the image name, a status, and the
    LaTeX result ('' with status 'failed' when OCR cannot run).
    """
    image = request.args.get("image")
    try:
        expr = pytesseract.image_to_string(
            image, lang='eng',
            output_type=pytesseract.Output.DICT)['text']
        logger.debug('Opened {}.'.format(image))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; OCR failures (bad path, missing tesseract
        # binary) are reported to the caller as a failed status.
        logger.error(
            'Failed to open {}, make sure you\'ve installed the tesseract-ocr.'
            .format(image))
        return json.dumps({
            'image': image,
            'status': 'failed',
            'latex': ''
        }, indent=4)
    res = Latexer(Cleaner(expr))
    return json.dumps({
        'image': image,
        'status': res.status,
        'latex': str(res)
    }, indent=4)
class RDFFileReader:
    """Reader for WebNLG-style XML benchmark files.

    Parses each <entry>, cleans its triples/templates/texts, and exposes
    the per-sentence records through self.data.  Tracks counts of dirty
    entries (self.cnt_dirty_data) and coreference problems
    (self.cnt_corefs).
    NOTE(review): relies on project helpers Cleaner, NLP, show_var,
    fix_tokenize, fix_template_word, flatten_list — semantics assumed
    from usage.
    """

    def __init__(self, file_name, verbose=False):
        """Parse `file_name` (XML) into self.data; report counters if
        `verbose` is truthy."""
        self.cleaner = Cleaner()
        self.cleaner.clean(file_name)
        self.nlp = NLP()
        self.data = []           # accumulated per-sentence records
        self.file_name = file_name
        self.cnt_dirty_data = 0  # entries discarded as malformed
        self.cnt_corefs = 0      # sentences skipped for coreference issues
        with open(file_name, encoding="utf-8") as f:
            content = f.read()
        try:
            structure = xmltodict.parse(content)
        except:
            # NOTE(review): debug scaffolding — bare except dropping into
            # pdb; `structure` stays undefined afterwards.
            show_var(['file_name'])
            import pdb
            pdb.set_trace()
        for entry_ix, entry in enumerate(
                self._triples_from_obj(structure["benchmark"]["entries"],
                                       "entry")):
            self.entry_ix = entry['@eid']
            # Modified triples: "subj | pred | obj" strings, stripped.
            triplets = [
                tuple(map(str.strip, r.split("|")))
                for r in self._triples_from_obj(
                    entry["modifiedtripleset"], "mtriple")
            ]
            # Entity map: "tag | entity" pairs.
            entitymaps = dict([
                tuple(map(str.strip, entitymap.split("|")))
                for entitymap in self._triples_from_obj(
                    entry['entitymap'], 'entity')
            ])
            sentences = list(self.extract_sentences(entry["lex"]))
            for s_tripleset, text, template, ner2ent in sentences:
                self.data.append({
                    # 'rdfs': triplets,
                    'triples': s_tripleset,
                    'target': template,
                    'target_txt': text,
                    'ner2ent': ner2ent,
                })
        if verbose and self.cnt_dirty_data:
            show_var(["self.cnt_dirty_data"])
        if verbose and self.cnt_corefs:
            show_var(["self.cnt_corefs"])

    @staticmethod
    def _triples_from_obj(obj, t_name):
        """Normalise xmltodict output: return the items stored under
        `t_name` as an iterable, whether the XML held zero, one or many.

        Returns [] when `obj` is None or lacks `t_name`.  Note the
        many-items case returns a lazy map object, not a list.
        """
        def _triples_fix(triplets):
            # xmltodict collapses a single child to a bare dict; wrap it.
            if not isinstance(triplets, list):
                return [triplets]
            else:
                return map(lambda t: t, triplets)

        if not isinstance(obj, list):
            if obj is not None:
                if t_name in obj:
                    return _triples_fix(obj[t_name])
            return []
        else:
            return [_triples_fix(o[t_name]) for o in obj]

    def extract_sentences(self, lex):
        """Yield (triples, text, template, tag2ent) per usable sentence of
        an entry's <lex> element, skipping 'bad' and unfixable ones."""
        sentences = lex
        if not isinstance(sentences, list):
            sentences = [sentences]
        for s in sentences:
            # Sentences annotated '@comment' == 'bad' are skipped outright.
            if s['@comment'] == 'bad':
                continue
            template = s['template']
            text = s['text']
            # tag -> entity mapping from the <references> element.
            tag2ent = dict([
                (r['@tag'], r['@entity'])
                for r in self._triples_from_obj(s['references'], 'reference')
            ])
            # One triple list per sentence of the sorted tripleset;
            # empty sentences are filtered by the trailing `if s_triples`.
            s_tripleset_raw = [[
                tuple(map(str.strip, r.split("|")))
                for r in self._triples_from_obj(s_triples, 'striple')
            ] for s_triples in self._triples_from_obj(
                s["sortedtripleset"], 'sentence') if s_triples]
            fixed = self.fix_document(s_tripleset_raw, template, text,
                                      tag2ent)
            if fixed is None:
                continue
            s_tripleset, template, text, tag2ent = fixed
            # Split document-level template/text into per-sentence lists.
            if len(s_tripleset) == 1:
                template = [template]
                text = [text]
            else:
                template = self.nlp.sent_tokenize(template)
                text = self.nlp.sent_tokenize(text)
                text = fix_tokenize(text)
            # All three sequences must align one-to-one; otherwise the
            # entry is counted dirty and dropped.
            if len({len(template), len(text), len(s_tripleset)}) != 1:
                # import pdb;
                # pdb.set_trace()
                self.cnt_dirty_data += 1
                continue
            for s_t, tex, tem in zip(s_tripleset, text, template):
                new_s_t, tem, uniq_tag2ent = \
                    self.fix_sentence(s_t, tem, tag2ent)
                if not (new_s_t and tem and tex and uniq_tag2ent):
                    self.cnt_corefs += 1
                    # import pdb;pdb.set_trace()
                    continue
                yield new_s_t, tex, tem, uniq_tag2ent

    def fix_document(self, s_tripleset_raw, template, text, tag2ent):
        """Clean one document: normalise template words, tokenize, drop
        empty triple lists, deduplicate tags mapping to one entity, strip
        quotes, and replace '-' with '_' in entity tags.

        Returns (s_tripleset, template, text, uniq_tag2ent), or None when
        there is nothing usable (counted in self.cnt_dirty_data).
        """
        # check template: apply the project's word-level fixes.
        template = ' '.join(
            [fix_template_word[word] if word in fix_template_word else word
             for word in template.split()]) \
            if template else template
        # tokenization
        text = self.nlp.word_tokenize(text)
        template = self.nlp.word_tokenize(template)
        # clean s_tripleset: drop empty per-sentence triple lists.
        s_tripleset = [s for s in s_tripleset_raw if s]
        self.cnt_dirty_data += len(s_tripleset_raw) - len(s_tripleset)
        if (not tag2ent) or (not s_tripleset):
            self.cnt_dirty_data += not tag2ent
            return None
        # fix this case "same entity has different ners BRIDGE-1 PATIENT-1":
        # keep only the first tag of each entity and rewrite the template.
        ent2tags = defaultdict(list)
        for tag, ent in tag2ent.items():
            ent2tags[ent] += [tag]
        tag2uniq_tag = {}
        for ent, tags in ent2tags.items():
            for tag in tags:
                tag2uniq_tag[tag] = tags[0]
        uniq_tag2ent = {
            tag: ent
            for tag, ent in tag2ent.items() if tag in tag2uniq_tag.values()
        }
        for tag, uniq_tag in tag2uniq_tag.items():
            template = template.replace(tag, uniq_tag)
        assert uniq_tag2ent
        ent2uniq_tag = {v: k for k, v in uniq_tag2ent.items()}
        assert len(ent2uniq_tag) == len(uniq_tag2ent)
        # clean out extra quotes around entity names
        uniq_tag2ent = {k: v.strip('\"') for k, v in uniq_tag2ent.items()}
        try:
            s_tripleset = [[(subj.strip('\"'), predi, obj.strip('\"'))
                            for subj, predi, obj in s_triples]
                           for s_triples in s_tripleset]
        except:
            # NOTE(review): debug scaffolding — fires when a triple is not
            # exactly (subj, pred, obj).
            import pdb
            pdb.set_trace()
        # replaces '-' with '_' only in entity types
        tags = set(uniq_tag2ent.keys())
        for tag in tags:
            template = template.replace(tag, tag.replace('-', '_'))
        template = template.replace('BRIDGE-', 'BRIDGE_')
        template = template.replace('AGENT-', 'AGENT_')
        template = template.replace('PATIENT-', 'PATIENT_')
        uniq_tag2ent = {
            k.replace('-', '_'): v
            for k, v in uniq_tag2ent.items()
        }
        return s_tripleset, template, text, uniq_tag2ent

    def fix_sentence(self, s_tripleset, template, tag2ent):
        """Filter a sentence's triples down to those whose head and tail
        entities are both tagged and present in the template; inline
        untripled entities back into the template text.

        Returns (s_tripleset, template, tag2tri_ent)."""
        ent2tags = {v: k for k, v in tag2ent.items()}
        # s_tripleset must meet "head && tail are in template && tag2ent"
        bad_triples = set()
        for triple_ix, triple in enumerate(s_tripleset):
            for ent in [triple[0], triple[-1]]:
                if ent in ent2tags:
                    if ent2tags[ent] not in template:
                        bad_triples.add(triple_ix)
                        continue
                else:
                    bad_triples.add(triple_ix)
                    continue
        s_tripleset = [
            triple for triple_ix, triple in enumerate(s_tripleset)
            if triple_ix not in bad_triples
        ]
        # tag2ent are entities only in triple_entities
        triple_entities = set(
            flatten_list([(triple[0], triple[-1])
                          for triple in s_tripleset]))
        tag2tri_ent = {
            k: v
            for k, v in tag2ent.items() if v in triple_entities
        }
        # templates only have triple_entities: replace tags of entities
        # that no surviving triple mentions with the entity surface form.
        for tag, ent in tag2ent.items():
            if ent not in triple_entities:
                ent = ent.replace('_', ' ')
                template = template.replace(tag, ent)
        if {word for word in template.split()
                if 'AGENT' in word or 'BRIDGE' in word or 'PATIENT' in word} \
                != set(tag2tri_ent.keys()):
            self.cnt_corefs += 1
        assert set(tag2tri_ent.values()) == triple_entities
        ''' TODO: Erroraneous case:
        train.csv:7123:"Ayam penyet mainIngredients Squeezed"" or ""smashed"" fried chicken served with sambal",PATIENT_2 is PATIENT_3 .,"Fried chicken is Squeezed"" or ""smashed"" fried chicken served with sambal .",The chicken is smashed and served hot with sambal .,"Ayam penyet Fried chicken Squeezed"" or ""smashed"" fried chicken served with sambal",AGENT_1 PATIENT_2 PATIENT_3,ROOT mainIngredients mainIngredients_inv,mainIngredients,"[0, 2]","[2, 2, 8]","{""AGENT_1"": ""Ayam penyet"", ""PATIENT_2"": ""Fried chicken"", ""PATIENT_3"": ""Squeezed\"" or \""smashed\"" fried chicken served with sambal""}","[[0, 4], [4, 2], [2, 5], [5, 0]]","Ayam penyet <ENT_SEP> Fried chicken <ENT_SEP> Squeezed"" or ""smashed"" fried chicken served with sambal <ENT_REL_SEP> mainIngredients <REL_TRP_SEP> 0 2 0","Ayam penyet mainIngredients Squeezed"" or ""smashed"" fried chicken served with sambal <ENT_TGT_SEP> PATIENT_2 is PATIENT_3 . <TGT_TXT_SEP> The chicken is smashed and served hot with sambal ."
        train.csv:7359:Bakewell tart ingredient Frangipane,AGENT_1 contains PATIENT_3 .,Bakewell pudding contains Frangipane .,It contains frangipane .,Bakewell pudding Bakewell tart Frangipane,AGENT_1 BRIDGE_2 PATIENT_3,ROOT ingredient ingredient_inv,ingredient,"[1, 2]","[2, 2, 1]","{""AGENT_1"": ""Bakewell pudding"", ""BRIDGE_2"": ""Bakewell tart"", ""PATIENT_3"": ""Frangipane""}","[[1, 4], [4, 2], [2, 5], [5, 1]]",Bakewell pudding <ENT_SEP> Bakewell tart <ENT_SEP> Frangipane <ENT_REL_SEP> ingredient <REL_TRP_SEP> 1 2 0,Bakewell tart ingredient Frangipane <ENT_TGT_SEP> AGENT_1 contains PATIENT_3 . <TGT_TXT_SEP> It contains frangipane .
        {
            "sent": "demarce short stories in the the grantville gazettes precede eric flint novels .",
            "graph": [
                { "truth": "precededBy", "pred": "precededBy", "ent0_ent1": "1634: the bavarian crisis ENT0_END demarce short stories in the the grantville gazettes" },
                { "truth": "<unk>", "pred": "author", "ent0_ent1": "1634: the bavarian crisis ENT0_END eric flint" }
            ]
        }
        '''
        return s_tripleset, template, tag2tri_ent
#df = pd.DataFrame(topic_questions_detail,columns=['tid','qid','aids'])
# Script: connect to the local MongoDB, dump the questions_detail
# collection into a DataFrame, then fetch + print the cleaned content of
# every referenced answer.
mogo_client = MongoClient('mongodb://localhost:27017/')
db = mogo_client['test']
col = db['questions_detail']
# Fix: Collection.count() was deprecated in PyMongo 3.7 and removed in
# PyMongo 4 — count_documents({}) is the supported replacement.
print(col.count_documents({}))
#target = col.find_one({'tid' : 19575211 })
target = pd.DataFrame(list(col.find()))
#print(target.describe())
print(target)
for answers in target['aids']:
    for ans_id in answers:
        ans = client.answer(ans_id)
        print(Cleaner.filter_tags(ans.content))
        time.sleep(3)  # throttle requests to the remote API
# for q in topic.unanswered_questions:
#     if(q.follower_count > 1000):
#         #print("question {}, created at {}, has {} followers, {} answers\n".format(q.title, datetime.utcfromtimestamp(q.created_time).strftime('%Y-%m-%d %H:%M:%S'), q.follower_count, q.answer_count))
#         for ans in q.answers:
#             for com in ans.comments:
#                 print("question {} - answer {} {}- comments {}\n".format(q.id, ans.id, ans.content, com.content))
'-im', type=str, required=True, help='Input Image') parser.add_argument('--verbose', '-v', action='store_true', help='Show debugging data.') args = parser.parse_args() if not args.verbose: logger.disable('utils') logger.disable('__main__') try: expr = pytesseract.image_to_string( args.image, lang='eng', output_type=pytesseract.Output.DICT)['text'] logger.debug('Opened {}.'.format(args.image)) except: logger.error( 'Failed to open {}, make sure you\'ve installed the tesseract-ocr.' .format(args.image)) exit( 'Failed to open {}, make sure you\'ve installed the tesseract-ocr.' ) res = Latexer(Cleaner(expr)) print(res)