Ejemplo n.º 1
0
def genSegmentTermList(termList):
    """Clean a list of raw terms and return them via ``sortedByLength``.

    Stages, in order: whitespace cleanup, bracket extraction, begin/end
    trimming, keeping only the text before the first comma, rejection
    filters, stop-word removal and de-duplication. The current list size is
    printed after the bracket, split and final stages.

    Args:
        termList: iterable of term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned terms.
    """
    # Remove useless whitespace and de-duplicate.
    termList = uniqueList([removeUselessSpace(raw) for raw in termList])

    # Extract bracket content: keep the term without brackets plus every
    # piece that removeBracket reports alongside it.
    bracketPairs = [removeBracket(raw) for raw in tqdm(termList)]
    expanded = []
    for outer, inner in bracketPairs:
        expanded.append(outer)
        expanded.extend(inner)
    termList = uniqueList(expanded)
    print('size={}'.format(len(termList)))

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(item) for item in tqdm(termList)]

    # Split on the comma and keep only the first segment.
    # NOTE(review): the character class lists ',' twice; the second one may
    # have been meant to be the full-width comma - confirm.
    firstSegments = [
        re.split('[,,]', item, maxsplit=1)[0] for item in tqdm(termList)
    ]
    termList = uniqueList(firstSegments)
    print('size={}'.format(len(termList)))

    # Reject, one pass per predicate: terms containing punctuation, purely
    # numeric terms, terms containing time/temperature words.
    for rejected in (containPunc, allDigit, containUselessDigitTerm):
        termList = [item for item in termList if not rejected(item)]

    # Remove stop words, then de-duplicate once more.
    termList = uniqueList(removeStopwords(termList))
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Ejemplo n.º 2
0
def genSegmentTermList(termList):
    """Clean a list of raw terms and return them via ``sortedByLength``.

    Stages, in order: whitespace cleanup, begin/end trimming, removal of
    descriptive sentences, splitting on commas, rejection filters, a
    length/Chinese-content filter, removal of terms starting with '见',
    stop-word removal and de-duplication. The list size is printed after the
    split and at the end.

    Args:
        termList: iterable of term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned terms.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python. Compiled once
    # instead of being looked up for every term.
    # presumably these are "see X" cross-reference entries - confirm.
    seePrefix = re.compile(r'^见[\w]+')

    # Remove useless whitespace and de-duplicate.
    termList = uniqueList([removeUselessSpace(term) for term in termList])

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(term) for term in tqdm(termList)]
    # Drop descriptive sentences.
    termList = [term for term in termList if not isDescription(term)]

    # Split every term on commas and keep all (stripped) pieces.
    # NOTE(review): the character class lists ',' twice; the second one may
    # have been meant to be the full-width comma - confirm.
    pieces = []
    for term in tqdm(termList):
        pieces.extend(subword.strip() for subword in re.split('[,,]', term))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    # Drop terms containing punctuation.
    termList = [term for term in termList if not containPunc(term)]
    # Drop purely numeric terms.
    termList = [term for term in termList if not allDigit(term)]
    # Drop terms containing time or temperature words.
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]

    # Keep terms whose length is strictly between 1 and 20 and that contain
    # Chinese characters.
    termList = [
        term for term in termList
        if 1 < len(term) < 20 and containCNS(term)
    ]
    # Drop terms that start with '见' followed by at least one word character.
    termList = [term for term in termList if seePrefix.match(term) is None]

    # Remove stop words, then de-duplicate once more.
    termList = removeStopwords(termList)
    termList = uniqueList(termList)
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Ejemplo n.º 3
0
 def baikeScript():
     """Merge the Wikipedia and Baidu Baike term files into one file.

     Reads ``umls_wikipedia.txt`` and ``umls_baidubaike.txt`` from ``folder``
     (in that order), de-duplicates the combined lines, orders them with
     ``sortedByLength``, prints the resulting size and writes the terms, one
     per line, to ``umls_baike.txt`` in the same folder.
     """
     lines = []
     for fileName in ('umls_wikipedia.txt', 'umls_baidubaike.txt'):
         # Context manager so each input handle is closed deterministically.
         with open(folder + os.sep + fileName) as fin:
             lines.extend(fin.read().splitlines())

     termList = sortedByLength(uniqueList(lines))
     print('size={}'.format(len(termList)))

     # Build the text before opening the output so a failure here does not
     # leave a truncated file behind.
     text = '\n'.join(termList)
     with open(folder + os.sep + 'umls_baike.txt', 'w') as fout:
         print(text, file=fout)
Ejemplo n.º 4
0
 def baiduBaikeScript():
     """Filter the terms in ``umls.json`` with ``baiduBaikeFilter`` and save them.

     Loads the JSON term list from ``folder``, passes it through
     ``baiduBaikeFilter`` and ``sortedByLength`` and writes the result, one
     term per line, to ``umls_baidubaike.txt`` in the same folder. The
     ``requests`` and ``urllib3`` loggers are raised to WARNING first.
     """
     import logging
     # Raise the level of the HTTP client loggers to WARNING.
     for loggerName in ("requests", "urllib3"):
         logging.getLogger(loggerName).setLevel(logging.WARNING)

     # Context manager so the input handle is closed deterministically.
     with open(folder + os.sep + 'umls.json') as fin:
         termList = json.load(fin)
     termList = sortedByLength(baiduBaikeFilter(termList))

     # Build the text before opening the output so a failure here does not
     # leave a truncated file behind.
     text = '\n'.join(termList)
     with open(folder + os.sep + 'umls_baidubaike.txt', 'w') as fout:
         print(text, file=fout)
Ejemplo n.º 5
0
 def script2():
     """Filter ``umls_bgequal.txt`` through both Baike filters and save the union.

     Reads the terms from ``umls_bgequal.txt`` in ``folder``, runs them
     through ``baiduBaikeFilter`` (together with the lines of
     ``umls_baidubaike.txt``) and through ``wikiPediaFilter``, de-duplicates
     and orders the combined result, prints its size and writes it, one term
     per line, to ``umls_bgequal_baike.txt``.
     """
     from segment.umls import baiduBaikeFilter, wikiPediaFilter

     # Context managers so both input handles are closed deterministically.
     with open(folder + os.sep + 'umls_bgequal.txt') as fin:
         termList = fin.read().splitlines()
     with open(folder + os.sep + 'umls_baidubaike.txt') as fin:
         baiduLines = fin.read().splitlines()

     baiduTermList = baiduBaikeFilter(termList, baiduLines)
     wikiTermList = wikiPediaFilter(termList)
     termList = sortedByLength(uniqueList(baiduTermList + wikiTermList))
     print('size: {}'.format(len(termList)))

     # Build the text before opening the output so a failure here does not
     # leave a truncated file behind.
     text = '\n'.join(termList)
     with open(folder + os.sep + 'umls_bgequal_baike.txt', 'w') as fout:
         print(text, file=fout)
Ejemplo n.º 6
0
def genSegmentTermList(termList):
    """Clean a list of raw terms and return them via ``sortedByLength``.

    Stages, in order: whitespace cleanup, a coarse length/Chinese-content
    filter, bracket extraction, begin/end trimming, splitting on commas and
    semicolons, numeric and time/temperature rejection, a second begin/end
    trimming, a strict length/all-Chinese filter, stop-word removal and
    de-duplication. The list size is printed after most stages.

    Args:
        termList: iterable of term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned terms.
    """
    def report(current):
        # Progress trace: size of the working list after a stage.
        print('size={}'.format(len(current)))

    # Remove useless whitespace and de-duplicate.
    termList = uniqueList([removeUselessSpace(raw) for raw in termList])
    report(termList)

    # First-pass filter: longer than one character and containing Chinese.
    termList = [
        item for item in tqdm(termList) if len(item) > 1 and containCNS(item)
    ]
    report(termList)

    # Extract bracket content: keep the term without brackets plus every
    # piece that removeBracket reports alongside it.
    bracketPairs = [removeBracket(item) for item in tqdm(termList)]
    expanded = []
    for outer, inner in bracketPairs:
        expanded.append(outer)
        expanded.extend(inner)
    termList = uniqueList(expanded)
    report(termList)

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(item) for item in tqdm(termList)]
    report(termList)

    # Split on ',' and ';' and keep every stripped piece.
    # NOTE(review): the original comment mentions Chinese and English comma,
    # period, colon and semicolon, but the class below only holds ',' and a
    # repeated ';' - confirm the intended character set.
    pieces = [
        piece.strip()
        for item in tqdm(termList)
        for piece in re.split('[,;;]', item)
    ]
    termList = uniqueList(pieces)
    report(termList)

    # Reject, one pass per predicate: purely numeric terms, then terms
    # containing time/temperature words.
    for rejected in (allDigit, containUselessDigitTerm):
        termList = [item for item in tqdm(termList) if not rejected(item)]

    # Strip invalid leading/trailing characters again after splitting.
    termList = [removeBeginEnd(item) for item in tqdm(termList)]
    report(termList)

    # Final filter: length strictly between 1 and 20 and entirely Chinese.
    termList = [
        item for item in tqdm(termList)
        if 1 < len(item) < 20 and allCNS(item)
    ]
    report(termList)

    # Remove stop words, then de-duplicate once more.
    termList = uniqueList(removeStopwords(termList))
    report(termList)
    # Order by length.
    return sortedByLength(termList)
Ejemplo n.º 7
0
def genSegmentTermList(termList):
    """Clean a list of raw terms and return them via ``sortedByLength``.

    Stages, in order: whitespace cleanup, begin/end trimming, numeric and
    time/temperature rejection, a strict length/all-Chinese filter,
    stop-word removal and de-duplication. The final size is printed.

    Args:
        termList: iterable of term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned terms.
    """
    # Remove useless whitespace and de-duplicate.
    cleaned = uniqueList([removeUselessSpace(raw) for raw in termList])

    # Strip invalid leading/trailing characters.
    cleaned = [removeBeginEnd(item) for item in tqdm(cleaned)]

    # Reject, one pass per predicate: purely numeric terms, then terms
    # containing time/temperature words.
    for rejected in (allDigit, containUselessDigitTerm):
        cleaned = [item for item in cleaned if not rejected(item)]

    # Final filter: length strictly between 1 and 20 and Chinese only.
    cleaned = [
        item for item in cleaned if 1 < len(item) < 20 and allCNS(item)
    ]

    # Remove stop words, then de-duplicate once more.
    cleaned = uniqueList(removeStopwords(cleaned))
    print('size={}'.format(len(cleaned)))
    return sortedByLength(cleaned)
Ejemplo n.º 8
0
def genSegmentTermList(termList):
    """Clean a list of raw terms and return them via ``sortedByLength``.

    Stages, in order: whitespace cleanup, begin/end trimming, splitting on
    commas, rejection filters, removal of single non-Chinese characters,
    stop-word removal and de-duplication. The list size is printed after the
    split and at the end.

    Args:
        termList: iterable of term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned terms.
    """
    # Remove useless whitespace and de-duplicate.
    termList = uniqueList([removeUselessSpace(raw) for raw in termList])

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(item) for item in tqdm(termList)]

    # Split every term on commas and keep all stripped pieces.
    # NOTE(review): the character class lists ',' twice; the second one may
    # have been meant to be the full-width comma - confirm.
    pieces = [
        piece.strip()
        for item in tqdm(termList)
        for piece in re.split('[,,]', item)
    ]
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    # Reject, one pass per predicate: terms containing punctuation, purely
    # numeric terms, terms containing time/temperature words.
    for rejected in (containPunc, allDigit, containUselessDigitTerm):
        termList = [item for item in termList if not rejected(item)]

    # Drop single characters that are not Chinese; anything longer (or
    # shorter) than one character passes without calling isCNS.
    termList = [item for item in termList if len(item) != 1 or isCNS(item)]

    # Remove stop words, then de-duplicate once more.
    termList = uniqueList(removeStopwords(termList))
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Ejemplo n.º 9
0
def genSnomedctSegment(termList):
    """Clean a list of raw terms and return them via ``sortedByLength``.

    Stages, in order: whitespace cleanup, bracket extraction, begin/end
    trimming, splitting on commas and colons, rejection filters, a
    length/Chinese-content filter, stop-word removal and de-duplication. The
    list size is printed after most stages.

    Args:
        termList: iterable of term strings.

    Returns:
        The result of ``sortedByLength`` applied to the cleaned terms.
    """
    def report(current):
        # Progress trace: size of the working list after a stage.
        print('size={}'.format(len(current)))

    # Remove useless whitespace and de-duplicate.
    termList = uniqueList([removeUselessSpace(raw) for raw in termList])

    # Extract bracket content: keep the term without brackets plus every
    # piece that removeBracket reports alongside it.
    bracketPairs = [removeBracket(item) for item in tqdm(termList)]
    expanded = []
    for outer, inner in bracketPairs:
        expanded.append(outer)
        expanded.extend(inner)
    termList = uniqueList(expanded)
    report(termList)

    # Strip invalid leading/trailing characters.
    termList = [removeBeginEnd(item) for item in tqdm(termList)]
    report(termList)

    # Split every term on commas and colons and keep all stripped pieces.
    # NOTE(review): ',' and ':' each appear twice in the character class;
    # the repeats may have been meant to be the full-width forms - confirm.
    pieces = [
        piece.strip()
        for item in tqdm(termList)
        for piece in re.split('[,,::]', item)
    ]
    termList = uniqueList(pieces)
    report(termList)

    # Reject, one pass per predicate: terms containing punctuation, purely
    # numeric terms, terms containing time/temperature words.
    for rejected in (containPunc, allDigit, containUselessDigitTerm):
        termList = [item for item in termList if not rejected(item)]
    report(termList)

    # Keep terms longer than one character that contain Chinese.
    termList = [
        item for item in termList if len(item) > 1 and containCNS(item)
    ]
    report(termList)

    # Remove stop words, then de-duplicate once more.
    termList = uniqueList(removeStopwords(termList))
    report(termList)
    return sortedByLength(termList)
Ejemplo n.º 10
0
 def wikiPediaScript():
     """Filter the terms in ``umls.txt`` with ``wikiPediaFilter`` and save them.

     Reads one term per line from ``umls.txt`` in ``folder``, passes the list
     through ``wikiPediaFilter`` and ``sortedByLength`` and writes the
     result, one term per line, to ``umls_wikipedia.txt`` in the same folder.
     """
     # Context manager so the input handle is closed deterministically.
     with open(folder + os.sep + 'umls.txt') as fin:
         termList = fin.read().splitlines()
     termList = sortedByLength(wikiPediaFilter(termList))

     # Build the text before opening the output so a failure here does not
     # leave a truncated file behind.
     text = '\n'.join(termList)
     with open(folder + os.sep + 'umls_wikipedia.txt', 'w') as fout:
         print(text, file=fout)