# Example #1 (0) -- header left over from the code listing; kept as a comment so the script can run
# -*- coding: UTF-8 -*-
import subtext as sub
import procs.contents as conts
import subprocess
import random

# Absolute, machine-specific path to the Wiktionary XML dump that the sampling
# loop reads from with sed.
CORPUS = '/home/eduardo/Trabajo/wikidumps/dumps/eswiktionary-20151226-pages-articles-multistream.xml'
# Absolute, machine-specific path to the line-index file handed to
# sub.get_lines together with CORPUS.
INDEX_LINES = '/home/eduardo/Trabajo/wikidumps/dumps/wiktionary-line-index.txt'

# Contents processor from procs.contents; it is only passed through to
# sub.get_lines here.
ContsProc = conts.Contents()
# Index of line ranges in CORPUS. The rest of the script indexes it by integer
# position and uses elements [0] and [1] of each entry as the first and last
# line of a sed range.
# NOTE(review): presumably one entry per dictionary page, with the page title
# at [2] -- confirm against subtext.get_lines.
lines_index = sub.get_lines(ContsProc,INDEX_LINES,CORPUS)

# Dump the first SAMPLE_COUNT indexed line ranges of CORPUS into samples.txt,
# each one followed by a separator line.
SAMPLE_COUNT = 200
SEPARATOR = "##########################################################################\n"

# Opening in "w" mode truncates any previous samples.txt synchronously.  The
# earlier approach launched "rm samples.txt" without waiting for it, so the
# removal could race with (and delete) the first appended sample.
with open("samples.txt", "w") as samples:
    # Bound the loop by the index size so a short index does not raise
    # IndexError.
    for i in range(min(SAMPLE_COUNT, len(lines_index))):
        index = lines_index[i]
        # Alternative: sample random entries instead of the first ones.
        #index = random.choice(lines_index)

        # index[0] and index[1] are the first and last line of the range that
        # sed prints.  Passing an argument list (no shell) keeps the command
        # correct even if CORPUS contains spaces or shell metacharacters.
        line_range = "{0},{1}p".format(index[0], index[1])
        subprocess.call(["sed", "-n", line_range, CORPUS], stdout=samples)

        # sed writes straight to the file descriptor, so flush Python's own
        # buffer after every separator to keep the output in order.
        samples.write(SEPARATOR)
        samples.flush()
    

#for i in lines_index :
    #if i[2] == u"niño" :
	#j = i