Example #1
def get_code(email, upload_at):
    user_id = select_user_id(email)
    if user_id == '':
        data = {'email': '0', 'code': '0'}
        final = json.dumps(data, ensure_ascii=False).encode('utf8')
        return final
    else:
        code = select_code(user_id)
        if code == '':
            code = utils.create_code()
            query = "INSERT INTO code_user (user_id, code, upload_at) VALUES (%s,%s,%s);"
            data = (user_id, code, upload_at)
            conn.run_query(query, data)
            data = {'email': email, 'code': utils.remove_punctuation(str(code))}
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
        mservice.send_email("*****@*****.**",
                            "Rodolfo123123",
                            email,
                            "Tu código es: " + str(code))
            return final
        else:
            data = {'email': email, 'code': utils.remove_punctuation(str(code))}
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
            mservice.send_email("*****@*****.**",
                                "Rodolfo123123",
                                email,
                                "Tu código es: " + str(code))
            return final
Example #2
 def search(paragraph):
     paragraph = split(lower(remove_punctuation(paragraph)))
     for i in topic:
         for j in paragraph:
             if i == j:
                 return True
     return False
Example #3
 def start(paragraphs):
     s = split(remove_punctuation(lower(paragraphs)))  # clean the paragraph once, outside the loop
     for word in topic:
         for i in s:
             if i == word:
                 return True
     return False
Example #4
 def about_helper(paragraph):
     paragraph = remove_punctuation(lower(paragraph)).split()
     # print(paragraph)
     for i in topic:
         if i in paragraph:
             return True
     return False
Example #5
 def select(paragraph):
     remove_punctuation_paragraph = remove_punctuation(paragraph)  # remove punctuation
     split_paragraph_list = split(remove_punctuation_paragraph)  # split the paragraph into words
     for splited_paragraph in split_paragraph_list:
         if lower(splited_paragraph) in topic:  # compare each lowercased word against topic
             return True
     return False
Example #6
 def is_about_topic(s):
     low_s = lower(remove_punctuation(s))
     for _s in split(low_s):
         for _topic in topic:
             if _s == _topic:
                 return True
     return False
Example #7
 def helper(s):
     words = split(remove_punctuation(lower(s)))
     for i in range(len(words)):
         for j in range(len(topic)):
             if words[i] == topic[j]:
                 return True
     return False
Example #8
 def f(x):
     splitted = split(x)
     ls = [lower(remove_punctuation(s)) for s in splitted]
     for item in ls:
         if item in topic:
             return True
     return False
Example #9
 def sentence_about(sentence):
     words = split(sentence)
     words = [lower(remove_punctuation(w)) for w in words]
     for w in words:
         if w in topic:
             return True
     return False
Example #10
 def helpler(paragraph):
     new_para = split(lower(remove_punctuation(paragraph)))
     for i in new_para:
         for x in topic:
             if i == x:
                 return True
     return False
Example #11
 def select(paragraphs):
     paragraphs = lower(paragraphs)
     paragraphs = remove_punctuation(paragraphs)
     list_paragraphs = split(paragraphs)
     for words in topic:
         if words in list_paragraphs:
             return True
     return False
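Examples #2 through #11 (and several more below) implement the same predicate: return True if a paragraph mentions any word in a global topic list. They all lean on free functions lower, split, and remove_punctuation defined elsewhere in their projects. A minimal sketch of those helpers, enough to run the snippets above (an assumption; the real definitions vary per project):

import string

def remove_punctuation(s):
    # One plausible behavior: drop ASCII punctuation entirely. Some projects
    # instead replace each punctuation character with a space (see the test
    # in Example #36); either way, split() then yields clean words.
    return s.translate(str.maketrans('', '', string.punctuation))

def lower(s):
    return s.lower()

def split(s):
    return s.split()

topic = ['dog', 'cat', 'kitten']  # illustrative topic list, not from the source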
Example #12
def update_password(email, code, password):
    user_id = select_user_id(email)
    code_consult = select_code(user_id)
    code_consult = utils.remove_punctuation(str(code_consult))
    if code_consult == code:
        query = "UPDATE user SET password = %s WHERE user_id = %s;"
        data = (password, user_id)
        conn.run_query(query, data)
Example #13
def calculate_word_vector_model(input_path, output_path=None):
    document = []
    for line in open(input_path):
        line = utils.remove_punctuation(line)
        cutted_line = jieba.cut(line)
        document.append(list(cutted_line))
    model = gensim.models.Word2Vec(document)
    model.save(output_path)
    return model
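A model saved this way can be reloaded and queried with gensim's standard API (the path and query token below are illustrative):

import gensim

# Load the model written by calculate_word_vector_model and ask for the
# nearest neighbours of a token that appeared in the corpus.
model = gensim.models.Word2Vec.load('wordvec.model')
print(model.wv.most_similar('价格'))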
Example #15
 def f(p):
     p = remove_punctuation(p)
     p = lower(p)
     p = split(p)
     for i in p:
         for j in topic:
             if i == j:
                 return True
     return False
Example #16
    def func(paragraph):
        paragraph = split(remove_punctuation(lower(paragraph)))

        filtered = [x for x in topic if x in paragraph]

        return filtered != []
Example #17
 def is_topic_mentioned(paragraph):  # paragraph is a string, chosen by def choose
     lowered_paragraph = lower(paragraph)
     lowered_nopunc_paragraph = remove_punctuation(lowered_paragraph)
     lowered_nopunc_paragraph_list = split(lowered_nopunc_paragraph)
     for keyword in topic:
         if keyword in lowered_nopunc_paragraph_list:
             return True
     return False
Example #18
    def valid_topic(paragraph):
        paragraph = remove_punctuation(paragraph)
        paragraph = lower(paragraph)
        split_paragraph = split(paragraph)

        for split_words in split_paragraph:
            if split_words in topic:
                return True
        return False
Example #19
 def select(paragraph):
     paragraph = remove_punctuation(paragraph)
     paragraph = lower(paragraph)
     # list of all words in paragraph
     paragraph = split(paragraph)
     for word in topic:
         if word in paragraph:
             return True
     return False
Example #20
def make_id_for_entry(entry, style='gscholar'):
    """Take entry as a dict, and return an ID to use for the bib entry."""
    if style != 'gscholar':
        raise NotImplementedError('Not implemented yet.')
    try:
        entry['title']
    except KeyError:
        logger.info('Title entry missing. Could not create id for entry.')
        logger.info(entry)
        raise KeyError('I could not find title information from the given '
                       'DOI/arXiv. This often happens with books, which for '
                       'some reason often do not include the title information.')
    try:
        entry['author']
        entry['year']
    except KeyError:
        # try to pull down additional information from google scholar
        logger.info('I could not find author/year information from DOI/arxiv,'
                    ' attempting to pull information down from gscholar.')
        gscholar_result = pull_info_from_gscholar(
            entry['title'], accepted_fields=['author', 'year'])
        if 'author' in gscholar_result and 'year' in gscholar_result:
            logger.info('Author/year information pulled from scholar.')
            entry.update(gscholar_result)
        else:
            raise KeyError("author, title and year are required.")
    title = entry['title']
    logger.info('I found the title "{}"'.format(title))
    year = entry['year']
    author = entry['author'].split(',')[0].lower()
    # extract first author
    if author[0] == '{':
        author = author[1:]
    if author[-1] == '}':
        author = author[:-1]
    if ' ' in author:
        author = author.split(' ')[0]
    # -- extract first word (looking at "words" with more than 3 chars) --
    # gather all words in the title
    words_in_title = re.findall(r'\S+', title)
    # remove punctuation from words
    words_in_title = [utils.remove_punctuation(w) for w in words_in_title]
    # filter words with less than 3 chars
    words_in_title = [w for w in words_in_title if len(w) > 3]
    # extract first word
    first_word = words_in_title[0].lower()
    if first_word[0] == '{':
        first_word = first_word[1:]
    if first_word[-1] == '}':
        first_word = first_word[:-1]
    if '-' in first_word:
        first_word = first_word.split('-')[0]
    # build new id
    newid = '{}{}{}'.format(author, year, first_word)
    logger.info('New id for the given entry: `{}`'.format(newid))
    return newid
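A quick sanity check of the ID scheme (an illustrative entry, not from the source; it assumes utils.remove_punctuation leaves unpunctuated words unchanged):

entry = {'title': 'Attention Is All You Need',
         'author': 'Vaswani, Ashish',
         'year': '2017'}
print(make_id_for_entry(entry))
# -> 'vaswani2017attention': first author's surname, then the year, then the
#    first title word longer than three characters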
Example #21
 def tag_comments_test(self, comments):
     comments = utils.remove_punctuation(comments)
     phrase_tag = set()
     phrase_list = comments.split(' ')
     for p in phrase_list:
         for t in self.tags_repo:
             match_part, if_same = wordvec.compare_phrase(
                 p.strip(), t.strip(), self.model)
             if if_same:
                 phrase_tag.add((p.strip(), t.strip()))
     return phrase_tag
Example #22
 def select(string):
     # need to make lowercase and remove punctuation
     s = remove_punctuation(string)
     s = lower(s)
     # split for comparing
     s = split(s)
     for i in topic:
         if i in s:
             return True
     return False
Example #23
def updatePassword(email, code, password):

    em = selectUserID(email)
    code_old = selectCode(em)
    code_old = utils.remove_punctuation(str(code_old))

    if (code_old == code):

        query = "UPDATE user SET password = %s WHERE user_id = %s"
        data = (password, em)
        conn.run_query(query, data)
Example #25
 def tag_comments_database(self, comments):
     comments = utils.remove_punctuation(comments)
     print(comments)
     phrase_tag = set()
     phrase_list = comments.split(' ')
     for p in phrase_list:
         for t in self.tags_repo:
             match_part, if_same = wordvec.compare_phrase(p, t, self.model)
             if if_same:
                 phrase_tag.add(t)
     return phrase_tag
Example #26
 def __iter__(self):
     with open(ANT_NLP_FILE_PATH, "r", encoding="utf8") as atec:
         logging.info('generating word corpus, processing file %s',
                      ANT_NLP_FILE_PATH)
         for line in atec:
             line_code, s1, s2, label = line.strip().split("\t")
             s1 = utils.remove_punctuation(s1)
             s2 = utils.remove_punctuation(s2)
             yield list(jieba.cut(s1)) + list(jieba.cut(s2))
     for file in extract_wiki.list_all_files(PROCESSED_WIKI_FILE_PATH):
         logging.info('generating word corpus, processing file %s', file)
         with open(file, 'r', encoding="utf8") as wiki:
             for line in wiki:
                 line = utils.remove_punctuation(line)
                 if len(line) > 0:
                     # Chinese characters occupy the Unicode range [0x4E00, 0x9FA5]
                     yield [
                         word for word in list(jieba.cut(line))
                         if word and 0x4E00 <= ord(word[0]) <= 0x9FA5
                     ]
Example #28
    def __iter__(self):
        with open(ANT_NLP_FILE_PATH, "r", encoding="utf8") as atec:
            logging.info('generating char corpus, processing file %s',
                         ANT_NLP_FILE_PATH)
            for line in atec:
                lineno, s1, s2, label = line.strip().split("\t")
                s1 = utils.remove_punctuation(s1)
                s2 = utils.remove_punctuation(s2)
                yield list(s1) + list(s2)

        for file in extract_wiki.list_all_files(PROCESSED_WIKI_FILE_PATH):
            logging.info('generating char corpus, processing file %s', file)
            with open(file, 'r', encoding="utf8") as wiki:
                for line in wiki:
                    line = utils.remove_punctuation(line)
                    if len(line) > 0:
                        yield [
                            char for char in line
                            if char and 0x4E00 <= ord(char[0]) <= 0x9FA5
                        ]
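These __iter__ methods make their classes restartable streaming corpora: gensim's Word2Vec accepts any iterable of token lists and re-iterates it once per epoch, so neither input file has to fit in memory. A sketch of the wiring (the class name and tiny corpus are assumptions, not from the source):

import gensim

class CharCorpus:
    # Stand-in for the class that owns the __iter__ above.
    def __iter__(self):
        for sentence in ("深度学习", "机器学习"):
            yield list(sentence)  # one list of character tokens per sentence

model = gensim.models.Word2Vec(CharCorpus(), min_count=1)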
Example #29
def train_word_vector(source, dict, wordvec):
    utils.jieba_add_dict(dict)
    comments_df = DataFrame.from_csv(source, sep='\t')
    document = []
    for line in comments_df['comment'].values:
        line = utils.remove_punctuation(line)
        cutted_line = jieba.cut(line)
        document.append(list(cutted_line))
    model = gensim.models.Word2Vec(document)
    print('saving word vector model')
    model.save(wordvec)
    return model
Example #31
def clean(text):
  text = remove_accents(text)
  text = expand_contractions(text)
  text = handle_units(text)
  text = convert_word_to_number(text)
  text = remove_punctuation(text)
  doc = nlp(text)
  text = perform_spell_check(doc)
  doc = nlp(text)
  text = convert_plural_to_singular(doc)

  return text
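The helpers here are project-specific, but the shape is a standard text-normalization pipeline run over a spaCy doc. A sketch of the assumed setup and an illustrative call (the model name and the output are assumptions):

import spacy

nlp = spacy.load('en_core_web_sm')  # the nlp pipeline used inside clean()

print(clean("He didn't weigh 5kg!"))
# e.g. -> 'He did not weigh 5 kilogram' (exact output depends on the helpers)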
Example #32
def insertCode(email, upload_at):

    user_id = selectUserID(email)

    if user_id == '':

        data = {'email': '0', 'code': '0'}
        final = json.dumps(data, ensure_ascii=False).encode('utf8')
        return final
    else:

        em = selectUserID(email)

        code = selectCode(em)

        if code == '':

            code = utils.createCode()
            query = "INSERT INTO code_user (user_id, code, upload_at) VALUES (%s,%s,%s);"
            data = (em, code, upload_at)
            conn.run_query(query, data)
            data = {'email': email, 'code': code}
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
            mservice.sendEmail(
                "*****@*****.**", "Qchw-2017", email,
                str(code) + " is your Quechua ASR verification code")
            return final
        else:

            code_old = selectCode(em)
            data = {
                'email': email,
                'code': utils.remove_punctuation(str(code_old))
            }
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
            mservice.sendEmail(
                "*****@*****.**", "Qchw-2017", email,
                utils.remove_punctuation(str(code_old)) +
                " is your Quechua ASR verification code")
            return final
Example #33
def main(args):
    """
    This processes the output of fairseq-generate so that it can be scored with sacrebleu and 
    so that it has the shared task format. 
    """
    cands = []
    seen_cands = set()
    current_source = None
    for line in args.infile:
        tokens = line.strip().split("\t")
        if line.startswith("S-"):
            # it's hard to have fairseq pass prompt ids through the training/evaluation process
            # so we resort to regenerating ids based on the prompt text.
            # we have to be careful that the text is *exactly* the same, or the id generation will be wrong.
            current_source = debpe.clean(
                tokens[1]) if not args.no_clean else tokens[1]
            textID = makeID(current_source)
            print(f"\n{textID}{FIELDSEP}{current_source}", file=args.outfile)
            cands = []
            seen_cands.clear()
        elif line.startswith("T-"):
            pass
        elif line.startswith("H-") and len(tokens) == 3 and not '-inf' in line:
            score = float(tokens[1])
            if len(cands) == 0:
                top_score = score
                if args.threshold != 0.0:
                    prompt_threshold = (-1.0 * args.threshold) + top_score
            # this is the prediction, there may be many of these.
            if ((args.candlimit == -1 or len(cands) < args.candlimit) and \
                (args.threshold == 0.0 or score > prompt_threshold)):

                hyp = debpe.clean(
                    tokens[2]) if not args.no_clean else tokens[2]
                hyp = hyp.lower()

                # remove language code if present
                if hyp.startswith("<") and len(hyp) >= 4 and hyp[3] == '>':
                    hyp = hyp[5:]

                if args.remove_punctuation:
                    hyp = remove_punctuation(hyp)

                if hyp not in seen_cands:
                    print(hyp, file=args.outfile)

                    cands.append(hyp)
                    seen_cands.add(hyp)
Example #34
def tag_comments(comment, keys):
    tags = set()
    comment = utils.remove_punctuation(comment)
    comment = comment.strip(' ')
    split_comment = comment.split(' ')
    for phrase in split_comment:
        phrase = phrase.strip(' ')
        for key in keys:
            if phrase.find(key) > -1:
                if len(phrase) < 7 and len(phrase) > 2:
                    # len(phrase) counts Chinese characters, not the real
                    # byte length (each Chinese character is 3 bytes)
                    comment = comment.replace(phrase, "$%s$" % (phrase))
                    tags.add(phrase)
        result = "%s >> %s" % (comment.rstrip(), "\t".join(list(tags)))
    return result
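An illustrative call (hypothetical input; it assumes utils.remove_punctuation leaves an unpunctuated comment unchanged). A phrase of three to six characters that contains a key is wrapped in $...$ in the comment and collected as a tag:

print(tag_comments('物流 速度很快', ['快']))
# -> '物流 $速度很快$ >> 速度很快'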
Example #36
 def test_remove_punctuation(self):
     tester = "-,.:"
     tester = utils.remove_punctuation(tester)
     self.assertTrue(tester == "    ")
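The assertion pins down this project's behavior: each punctuation character becomes a space rather than being deleted. A minimal implementation consistent with that test (an assumption; the project's actual utils.remove_punctuation is not shown):

import string

def remove_punctuation(s):
    # Replace every ASCII punctuation character with a space, so the
    # four characters in "-,.:" become four spaces, as the test expects.
    return ''.join(' ' if ch in string.punctuation else ch for ch in s)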