def __init__(self):
    """Run the ``TextUtils`` initializer, then create the empty lookup dicts.

    Five independent, initially empty dictionaries are attached to the
    instance: ``firsts``, ``lasts``, ``unigram``, ``bigram`` and ``trigram``.
    How they are populated is not visible from this method.
    """
    TextUtils.__init__(self)

    # Boundary tables (presumably keyed by first/last items -- TODO confirm).
    self.firsts, self.lasts = {}, {}

    # One table per n-gram order; each gets its own dict object.
    self.unigram, self.bigram, self.trigram = {}, {}, {}
Beispiel #2
0
	def __init__(self):
		"""Initialise the ``TextUtils`` part, then set up empty lookup dicts.

		Creates ``firsts``, ``lasts``, ``unigram``, ``bigram`` and ``trigram``
		on the instance, each as its own empty dictionary.
		"""
		TextUtils.__init__(self)

		# Boundary tables (presumably first/last items -- TODO confirm).
		self.firsts, self.lasts = {}, {}

		# Separate dict per n-gram order; nothing is shared between them.
		self.unigram, self.bigram, self.trigram = {}, {}, {}
Beispiel #3
0
def preprocess_data(input_list):
    """Turn raw text lines into a list of ``Instance`` objects.

    Each line is passed through ``TextUtils.remove_blank`` and then
    ``TextUtils.tokenize``; the resulting token sequence is wrapped in an
    ``Instance`` created with ``tag=-1``.

    Args:
        input_list: Iterable of lines to process.

    Returns:
        A new list with one ``Instance`` per input line, in input order.
        An empty iterable yields an empty list.
    """
    return [
        Instance(TextUtils.tokenize(TextUtils.remove_blank(raw_line)), tag=-1)
        for raw_line in input_list
    ]
Beispiel #4
0
# https://www.gairuo.com/file/data/dataset/GDP-China.csv

def _flatten_contact(contact):
    """Flatten one nested ``contact`` cell via ``TextUtils.flatten_dict``.

    A fresh ``PrimitiveKVFormatter`` is created for every call and array
    indices start at 1.
    """
    return TextUtils.flatten_dict(
        contact,
        formatter=PrimitiveKVFormatter(),
        array_index_start=1,
    )


# Two-row demo frame mixing scalar, nested-dict, list and Decimal cells.
df = pd.DataFrame(
    {
        "id": [1, 2],
        "name": ["Kevin", "Jenny"],
        "address": [
            {"hometown": "Meizhou", "work": "Guangzhou"},
            {"hometown": "Hangzhou", "work": "Guangzhou"},
        ],
        "contact": [
            {
                "mobile": ["+86 16888", "+86 168888"],
                "mail": "*****@*****.**",
            },
            {
                "mobile": ["+86 16666", "+86 166666"],
                "mail": "*****@*****.**",
            },
        ],
        "magic": [
            Decimal('000001.10000010'),
            Decimal('000002.20000020'),
        ],
    }
)

# Only the "contact" column is flattened; "address" keeps its nested dicts.
df["contact"] = df["contact"].apply(_flatten_contact)

# Apply the display options only for the duration of the print.
with pd.option_context(
    "expand_frame_repr", False,
    "display.max_rows", None,
    "display.max_colwidth", None,
):
    print(df)
    # The script stops here with exit status 1 once the frame is printed.
    sys.exit(1)

# Build a <paper> element tree with one <section> per detected heading.
#
# A chunk counts as a section heading when it is bold and its font size lies
# between the body-text size and the next size listed after it (inclusive).
# Headings are ignored until one starting with "abstract" has been seen.

text_font_size = lpt.get_mfs()
all_sizes = lpt.get_font_sizes()
# NOTE(review): this takes the entry that follows the body size in all_sizes,
# so it presumably relies on all_sizes being sorted ascending -- confirm.
# It raises ValueError if the body size is not listed and IndexError if the
# body size is the last entry.
heading_size = all_sizes[all_sizes.index(text_font_size) + 1]

paper = etree.Element("paper")

curSection = None
mainBodyStarted = False

for c in lpt.chunks:
    chunk_font_size = c.get_mf_attr("font-size")
    chunk_font_style = c.get_mf_attr("font-style")
    chunkText = " ".join(i.text for i in c.words)
    chunkText = TextUtils.fix_wide_letters(TextUtils.remove_hyphens(chunkText))

    if text_font_size <= chunk_font_size <= heading_size and chunk_font_style == FontStyle.Bold:
        # The first heading that starts with "abstract" (any case) marks the
        # beginning of the main body; headings before it are skipped.
        if not mainBodyStarted and re.match("abstract", chunkText, flags=re.IGNORECASE):
            mainBodyStarted = True
        if not mainBodyStarted:
            continue
        curSection = etree.SubElement(paper, "section")

        # Split a leading dotted section number ("3", "2.1", "4.2.1", ...)
        # from the title. Raw string: \d, \. and \s are regex escapes, not
        # Python string escapes.
        m = re.match(r"^((?:\d\.)*\d)\s+(.*)$", chunkText)
        if m:
            curSection.attrib["number"] = m.group(1)
            chunkText = m.group(2)

        curSection.attrib["name"] = chunkText
        curSection.text = ""