Example No. 1
def sentences(a, b):
    """Return sentences in both a and b"""
    sen_a = st(a)
    sen_b = st(b)
    ls = []
    for sen in sen_a:
        if sen in sen_b and sen not in ls:
            ls.append(sen)

    return ls
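
A minimal usage sketch (assuming, as these snippets imply, that st is NLTK's sent_tokenize; the sample strings are hypothetical):

from nltk.tokenize import sent_tokenize as st

a = "The cat sat. The dog barked."
b = "The dog barked. A bird sang."
print(sentences(a, b))  # -> ['The dog barked.']
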
def process_unlabel_lapt_for_bilstmcrf(input_fn, output_fn):
    if os.path.exists(output_fn):
        print('data already exists', output_fn)
        return
    res = []
    for fn in os.listdir(input_fn):
        if not fn.endswith('json'):
            continue
        # read each review file and collect the review contents
        with open(os.path.join(input_fn, fn)) as f:
            js = json.load(f)
        reviews = js['Reviews']
        contents = [r['Content'] for r in reviews if r['Content'] is not None]
        res.extend(contents)

    # write one token per line in CoNLL style ("token O"), blank line between sentences
    with open(output_fn, 'w', encoding='utf-8') as f:
        for content in res:
            content = content.strip().lower()
            sents = st(content)
            for sent in sents:
                tokens = wt(sent)
                for token in tokens:
                    f.write(token)
                    f.write(' O\n')
                f.write('\n')
def parse_sentences_from_verses(verseList):
	"""
	Open a text file and parse its sentenes.
	"""
	allSents = []
	for v in verseList:
		currSents = st(v)
		for s in currSents:
			allSents.append(s)

	return allSents
Example No. 4
def parse_sentences_from_verses(verseList):
    """
	Open a text file and parse its sentenes.
	"""
    allSents = []
    for v in verseList:
        currSents = st(v)
        for s in currSents:
            allSents.append(s)

    return allSents
Example No. 5
    def sent_tokenize(self, text):
        text = text.translate(Tokenizer.PUNC_TABLE)
        text = re.sub(r'\s+', ' ', text).strip()
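        # mask '?', '!' and '.' that appear inside {}, [], (), <> or double quotes
        # with placeholders so the sentence tokenizer does not split on them;
        # the placeholders are restored after tokenization below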
        text = re.sub(r'({[^{}]*?)(\?)([^{}]*?})', r'\1__?__\3', text)
        text = re.sub(r'(\[[^\[\]]*?)(\?)([^\[\]]*?\])', r'\1__?__\3', text)
        text = re.sub(r'(\([^()]*?)(\?)([^()]*?\))', r'\1__?__\3', text)
        text = re.sub(r'(\<[^<>]*?)(\?)([^<>]*?\>)', r'\1__?__\3', text)
        text = re.sub(r'("[^"]*?)(\?)([^"]*?")', r'\1__?__\3', text)

        text = re.sub(r'({[^{}]*?)(!)([^{}]*?})', r'\1__!__\3', text)
        text = re.sub(r'(\[[^\[\]]*?)(!)([^\[\]]*?\])', r'\1__!__\3', text)
        text = re.sub(r'(\([^()]*?)(!)([^()]*?\))', r'\1__!__\3', text)
        text = re.sub(r'(\<[^<>]*?)(!)([^<>]*?\>)', r'\1__!__\3', text)
        text = re.sub(r'("[^"]*?)(!)([^"]*?")', r'\1__!__\3', text)

        text = re.sub(r'({[^{}]*?)(\.)([^{}]*?})', r'\1__.__\3', text)
        text = re.sub(r'(\[[^\[\]]*?)(\.)([^\[\]]*?\])', r'\1__.__\3', text)
        text = re.sub(r'(\([^()]*?)(\.)([^()]*?\))', r'\1__.__\3', text)
        text = re.sub(r'(\<[^<>]*?)(\.)([^<>]*?\>)', r'\1__.__\3', text)
        text = re.sub(r'("[^"]*?)(\.)([^"]*?")', r'\1__.__\3', text)

        text = text.replace("e.g.", "__eg__")
        text = text.replace("E.g.", "__eg__")
        text = text.replace("E.G.", "__eg__")
        text = text.replace("i.e.", "__ie__")
        text = text.replace("I.e.", "__ie__")
        text = text.replace("I.E.", "__ie__")
        sentences = []
        for sent in st(text):
            if self.__pre_check(sent):
                sent_text = sent.replace("__eg__", "e.g.").replace(
                    "__ie__",
                    "i.e.").replace("__?__",
                                    "?").replace("__!__",
                                                 "!").replace("__.__", ".")
                sent_text = re.sub(r'^(-CODE- |-TAB- |-IMG- |-URL- )(.*)',
                                   r'\2', sent_text)
                sent_text = re.sub(r'^(\()(.*)(\))$', r'\2', sent_text)
                sent_text = re.sub(r'^(\[)(.*)(\])$', r'\2', sent_text)
                sent_text = re.sub(r'^({)(.*)(})$', r'\2', sent_text)
                words = sent_text.split()
                # guard against short sentences before indexing words[1] and words[2]
                if len(words) > 2 and re.search(r'^[^A-Z]', words[0]) is not None \
                        and words[1] in {"A", "An", "The", "This", "That", "You", "We"} \
                        and re.search(r'^[^A-Z]', words[2]) is None:
                    sent_text = " ".join(words[1:])
                sent_text = sent_text.strip()
                if self.__post_check(sent_text):
                    sentences.append(Sentence(sent_text))
        # text = re.sub(r'\n(.+?[^.?!])\n([A-Z])', r'\n\n\2', text)
        # text = re.sub(r'\s+', " ", text.strip())
        # text = re.sub(r'([?!.]+) ', r'\1\n', text)
        # sentences = set(text.split("\n"))
        return sentences
Example No. 6
def return_sentence_of_word(word_original):
    """Return every sentence in the file that contains the given word."""
    with open(file_name, 'r') as f:
        content = f.read()

    final_content_sent = []
    for sentence in st(content):
        # str.index raises ValueError when the word is absent, so use `in` instead
        if word_original in sentence:
            final_content_sent.append(sentence)
    return final_content_sent
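
A hypothetical usage sketch; file_name is a module-level variable the original snippet assumes is defined elsewhere:

file_name = 'article.txt'  # hypothetical path, not part of the original snippet
for s in return_sentence_of_word('outlier'):
    print(s)
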
Example No. 7
	def simHelper(T):
		'''
		Given a text T, return a POS-tagged list of its words
		'''
		alphanum = letters+octdigits

		# part of speech word list for the text
		fullList = [word for subl in [pos_tag(wt(s)) for s in st(T)] for word in subl]

		# remove symbols and -NONE- tags from list by checking the first character of the word and tag
		posList = [word for word in fullList if word[1][0] in alphanum and word[0][0] in alphanum]

		return posList
def process_unlabel_rest_for_bilstmcrf(input_fn, output_fn):
    if os.path.exists(output_fn):
        print('data already exists', output_fn)
        return
    with open(input_fn, 'r') as input_file:
        lines = input_file.readlines()
    lines = [w for s in lines for w in s.split('\t')[1].lower().strip().split('\\n') if len(w) != 0]
    # write one token per line in CoNLL style ("token O"), blank line between sentences
    with open(output_fn, 'w') as output_file:
        for line in lines:
            sents = st(line)
            for sent in sents:
                tokens = wt(sent)
                for token in tokens:
                    output_file.write(token + ' O\n')
                output_file.write('\n')
def createCorpus(t):
    # split every paragraph of every document into sentences
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent_list in corpus:
        for s in sent_list:
            all_sent.append(s)
    # tokenize each sentence and lowercase every token
    all_words = []
    for sent in all_sent:
        all_words.append([word.lower() for word in wt(sent)])
    return all_words
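
A small usage sketch, assuming the expected input is a dict mapping document ids to lists of paragraph strings (hypothetical data):

docs = {
    'doc1': ["First paragraph. It has two sentences.", "Second paragraph."],
    'doc2': ["Another document with a single paragraph."],
}
print(createCorpus(docs))  # lists of lowercased tokens, one per sentence
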
Example No. 10
def _filter_sent_tokenize(corpus):
    ### 3.1.1
    # tokenize into sentences
    from nltk.tokenize import sent_tokenize as st

    original_sentences = st(corpus)

    # filter sentences
    sentences = [
        sentence for sentence in original_sentences if len(sentence) > 5
    ]

    # show sentences
    #     n_sent = len(sentences)
    #     for i in range(3):
    #         print(sentences[i])
    #     print("number of sentences:", n_sent)

    return sentences
Example No. 11
	def create_corpus(self):
		# split every paragraph of every document into sentences
		corpus = []
		all_sentences = []
		for k in self.data:
			for p in self.data[k]:
				corpus.append(st(p))
		for sent_list in corpus:
			for s in sent_list:
				all_sentences.append(s)
		# tokenize each sentence and lowercase every token
		all_words = []
		for sent in all_sentences:
			all_words.append([word.lower() for word in wt(sent)])
		return all_words
Example No. 12
	def summarize(self, text, n):

		sents = st(text)
		assert n <= len(sents)
		# assert raises an AssertionError if the requested summary length exceeds the sentence count

		word_sent = [wt(s.lower()) for s in sents]
		# list of lists of all the sentences 
		self._freq = self._compute_frequencies(word_sent)
		ranking = defaultdict(int)
		for i,sent in enumerate(word_sent):
			# enumerate yields (index, element) pairs for each sentence,
			# avoiding the need for a separate counter variable
			for word in sent:
				if word in self._freq:
					ranking[i] += self._freq[word]

		sents_idx = nlargest(n,ranking, key = ranking.get)
		return [sents[j] for j in sents_idx]  # return the n top-ranked sentences
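
The method above relies on its enclosing class for _compute_frequencies; a self-contained sketch of the same frequency-based ranking idea (a hypothetical standalone version, not the original class):

from collections import defaultdict
from heapq import nlargest
from nltk.tokenize import sent_tokenize as st, word_tokenize as wt

def summarize_simple(text, n):
    sents = st(text)
    word_sent = [wt(s.lower()) for s in sents]
    # raw term frequencies over the whole text
    freq = defaultdict(int)
    for words in word_sent:
        for w in words:
            freq[w] += 1
    # score each sentence by the summed frequency of its words
    ranking = {i: sum(freq[w] for w in words) for i, words in enumerate(word_sent)}
    return [sents[j] for j in nlargest(n, ranking, key=ranking.get)]
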
Example No. 13
def pre_process():
    filenames = os.listdir(raw_text_path)

    for f in filenames:
        with open(os.path.join(raw_text_path, f), 'r') as doc:
            texts = re.split('</?doc.*>\n+', doc.read())

            for page in texts:
                if page == '':
                    continue
                paragraphs = re.split('\n+', page)
                title = paragraphs[0]
                doc_path = os.path.join(output_path, title)

                if not os.path.exists(doc_path):
                    os.makedirs(doc_path)

                print "processing %s" % title

                for i in range(1, len(paragraphs)):
                    out = os.path.join(doc_path, "%d.txt" % i)
                    if os.path.exists(out):
                        continue
                    writer = codecs.open(out, 'w', 'utf-8')
                    sentences = st(paragraphs[i].decode('utf-8'))

                    for s in sentences:
                        tags = ner.tag(wt(s))

                        for t in tags:
                            writer.write('%s\t%s\n' % (t[0], t[1]))

                        writer.write('\n')

                    writer.flush()
                    writer.close()
Example No. 14
f = open('speckled.js', 'rb')
s = f.read().decode('latin1')
from nltk.tokenize import sent_tokenize as st
for x in st(s):
    print(x)
Example No. 15
		# average paragraph size
		wst = WhitespaceTokenizer()
		paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]

		# the approximate number of words in the document
		numWords = sum(paraWordCounts)

		# the average number of words per paragraph
		avgParagraphLen = mean(paraWordCounts)

		# rejoin the paragraphs
		text = ' '.join(paragraphs)

		# part of speech word list for the text
		text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]

		# remove symbols from list by checking the first character of the word
		text = [word for word in text if word[0][0] in alphanum]

		# convert words to lowercase and convert Penn Tree Bank tags to WordNet tags
		text = [(word[0].lower(), convertTag(word[1])) for word in text]

		# remove Nones
		text = [word for word in text if word[1]]

		nouns = [word for word in text if word[1] == 'n']
		numNouns = len(nouns)

		verbs = [word for word in text if word[1] == 'v']
		numVerbs = len(verbs)
Example No. 16
# Use the NLTK corpus that we saw in Chapter 2 as well
def readcorpus():
    raw_content_cg = cg.raw("burgess-busterbrown.txt")
    # print raw_content_cg[0:1000]
    return raw_content_cg[0:1000]


if __name__ == "__main__":
    print ""
    print "----------Output from Raw Text file-----------"
    print ""
    filecontentdetails = fileread()
    print filecontentdetails
    # sentence tokenizer
    st_list_rawfile = st(filecontentdetails)
    print len(st_list_rawfile)

    print ""
    print "-------Output from assigned variable-------"
    print ""
    localveriabledata = localtextvalue()
    print localveriabledata
    # sentence tokenizer
    st_list_local = st(localveriabledata)
    print len(st_list_local)
    print st_list_local

    print ""
    print "-------Output Corpus data--------------"
    print ""
Example No. 17
def __extractSynSets(T):
	'''
	Given a text T (as a string) find all words that have WordNet synsets
	@return a unique list of SynSet objects
	'''

	'''
	CONSTANTS
	'''
	nounTags = ['NN','NNP','NNS','NNPS']
	verbTags = ['VB','VBD','VBG','VBN','VBP','VBZ']
	adjTags = ['JJ','JJR','JJS']
	advTags = ['RB','RBR','RBS']
	alphanum = letters+octdigits

	
	def convertTag(tag):
		'''
		Converts a Penn Tree Bank POS tag to a WordNet
		@return the converted tag otherwise None
		'''
		if tag in nounTags:
			return 'n'
		elif tag in verbTags:
			return 'v'
		elif tag in adjTags:
			return 'as' # adjectives in WordNet can be head adj 'a' or satellite adj 's'
		elif tag in advTags:
			return 'r'
		else:
			return None 
	
	def getSynSet(w):
		'''
		For a word 'w' with POS tag 'tag' find the corresponding WordNet synset
		@return the best matching synset for 'w' otherwise None
		'''
		tag = w[1]
		word = w[0]

		# get the list of possible synsets for w
		sets = wn.synsets(word)
		
		if not tag or sets == []:
			return None

		# look through the list of possible synsets for the first one w/ a pos tag that matches 'tag'
		for s in sets:
			if s.pos in tag:
				return s

		return None

	# part of speech word list for the text
	fullList = [word for subl in [pos_tag(wt(s)) for s in st(T)] for word in subl]

	# remove symbols and -NONE- tags from list by checking the first character of the word and tag
	posList = [word for word in fullList if word[1][0] in alphanum and word[0][0] in alphanum]

	# convert words to lowercase and convert Penn Tree Bank tags to WordNet tags
	posList = [(word[0].lower(), convertTag(word[1])) for word in posList]

	# remove words for which there is no WordNet tag (i.e. tag is None) and remove duplicate values
	posList = list(set([word for word in posList if word[1]]))

	# for the words in the POS list create a list of syn sets using their tags (remove None values)
	synSets = [n for n in [getSynSet(w) for w in posList] if n] 

	return synSets
Example No. 18
from numba import vectorize

# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

text = """
Outlier Detection is the process of finding data objects with behaviors that are very different from expectation. Such objects are called outliers or anomalies. An outlier is a data object that deviates significantly from the rest of the objects, as if it were generated by a different mechanism. In a given data set, a data object is a global outlier if it deviates significantly from the rest of the data set. Global outliers are sometimes called point anomalies, and are the simplest type of outliers. Most outlier detection methods are aimed at finding global outliers. In a given data set, a data object is a contextual outlier if it deviates significantly with respect to a specific context of the object. Contextual outliers are also known as conditional outliers because they are conditional on the selected context. Given a data set, a subset of data objects forms a collective outlier if the objects as a whole deviate significantly from the entire data set. Importantly, the individual data objects may not be outliers. Outlier detection quality highly depends on the modeling of normal (non outlier) objects and outliers. Often, building a comprehensive model for data normality is very challenging, if not impossible. This is partly because it is hard to enumerate all possible normal behaviors in an application. The border between data normality and abnormality (outliers) is often not clear cut. Instead, there can be a wide range of gray area. Consequently, while some outlier detection methods assign to each object in the input data set a label of either “normal” or “outlier,” other methods assign to each object a score measuring the “outlier-ness” of the object. Technically, choosing the similarity/distance measure and the relationship model to describe data objects is critical in outlier detection. Unfortunately, such choices are often application-dependent. Different applications may have very different requirements. For example, in clinic data analysis, a small deviation may be important enough to justify an outlier. In contrast, in marketing analysis, objects are often subject to larger fluctuations, and consequently a substantially larger deviation is needed to justify an outlier. Outlier detection’s high dependency on the application type makes it impossible to develop a universally applicable outlier detection method. Instead, individual outlier detection methods that are dedicated to specific applications must be developed. As mentioned earlier, outliers are different from noise. It is also well known that the quality of real data sets tends to be poor. Noise often unavoidably exists in data collected in many applications. Noise may be present as deviations in attribute values or even as missing values. Low data quality and the presence of noise bring a huge challenge to outlier detection. They can distort the data, blurring the distinction between normal objects and outliers. Moreover, noise and missing data may “hide” outliers and reduce the effectiveness of outlier detection—an outlier may appear “disguised” as a noise point, and an outlier detection method may mistakenly identify a noise point as an outlier. In some application scenarios, a user may want to not only detect outliers, but also understand why the detected objects are outliers. To meet the understandability requirement, an outlier detection method has to provide some justification of the detection. 
For example, a statistical method can be used to justify the degree to which an object may be an outlier based on the likelihood that the object was generated by the same mechanism that generated the majority of the data. The smaller the likelihood, the more unlikely the object was generated by the same mechanism, and the more likely the object is an outlier. Supervised methods model data normality and abnormality. Domain experts examine and label a sample of the underlying data. Outlier detection can then be modeled as a classification problem. The task is to learn a classifier that can recognize outliers. The sample is used for training and testing. In some applications, the experts may label just the normal objects, and any other objects not matching the model of normal objects are reported as outliers. Other methods model the outliers and treat objects not matching the model of outliers as normal. Supervised methods of outlier detection must be careful in how they train and how they interpret classification rates due to the fact that outliers are rare in comparison to the other data samples. In some application scenarios, objects labeled as “normal” or “outlier” are not available. Thus, an unsupervised learning method has to be used. Unsupervised outlier detection methods make an implicit assumption: The normal objects are somewhat “clustered.” In other words, an unsupervised outlier detection method expects that normal objects follow a pattern far more frequently than outliers. Normal objects do not have to fall into one group sharing high similarity. Instead, they can form multiple groups, where each group has distinct features. However, an outlier is expected to occur far away in feature space from any of those groups of normal objects. This assumption may not be true all the time. In some applications, normal objects are diversely distributed, and many such objects do not follow strong patterns. In such scenarios, unsupervised methods may have a high false positive rate—they may mislabel many normal objects as outliers, and let many actual outliers go undetected. Due to the high similarity between such normal objects and outliers, modeling outliers using supervised methods may be far more effective. In many applications, although obtaining some labeled examples is feasible, the number of such labeled examples is often small. We may encounter cases where only a small set of the normal and/or outlier objects are labeled, but most of the data are unlabeled. Semi-supervised outlier detection methods were developed to tackle such scenarios. Semi-supervised outlier detection methods can be regarded as applications of semi-supervised learning methods. For example, when some labeled normal objects are available, we can use them, together with unlabeled objects that are close by, to train a model for normal objects. The model of normal objects then can be used to detect outliers—those objects not fitting the model of normal objects are classified as outliers. If only some labeled outliers are available, semi-supervised outlier detection is trickier. A small number of labeled outliers are unlikely to represent all the possible outliers. Therefore, building a model for outliers based on only a few labeled outliers is unlikely to be effective. To improve the quality of outlier detection, we can get help from models for normal objects learned from unsupervised methods. 
Proximity-based methods assume that an object is an outlier if the nearest neighbors of the object are far away in feature space, that is, the proximity of the object to its neighbors significantly deviates from the proximity of most of the other objects to their neighbors in the same data set. The effectiveness of proximity-based methods relies heavily on the proximity (or distance) measure used. In some applications, such measures cannot be easily obtained. Moreover, proximity-based methods often have difficulty in detecting a group of outliers if the outliers are close to one another. There are two major types of proximity-based outlier detection, namely distance based and density-based outlier detection. A distance-based outlier detection method consults the neighborhood of an object, which is defined by a given radius. An object is then considered an outlier if its neighborhood does not have enough other points. A density-based outlier detection method investigates the density of an object and that of its neighbors. Here, an object is identified as an outlier if its density is relatively much lower than that of its neighbors. Clustering-based methods assume that the normal data objects belong to large and dense clusters, whereas outliers belong to small or sparse clusters, or do not belong to any clusters. Clustering is an expensive data mining operation. A straightforward adaptation of a clustering method for outlier detection can be very costly, and thus does not scale up well for large data sets. The notion of outliers is highly related to that of clusters. Clustering-based approaches detect outliers by examining the relationship between objects and clusters. Intuitively, an outlier is an object that belongs to a small and remote cluster, or does not belong to any cluster.
"""

# @vectorize()
# def x():
sentences = st(text)  # tokenize the raw text into sentences

# Extract word vectors
word_embeddings = {}
f = open('../../glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

print(len(word_embeddings))
# remove punctuations, numbers and special characters
Example No. 19
def num_sen(text):
    """How long is the post."""
    return len(st(text))
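
A quick check (the count assumes NLTK's default Punkt model):

print(num_sen("Two sentences here. Yes, two."))  # -> 2
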
Example No. 20
		then the atom is electrically neutral. If an atom has more or fewer electrons than protons, 
		then it has an overall negative or positive charge, respectively – such atoms are called ions.

		The electrons of an atom are attracted to the protons in an atomic nucleus by the electromagnetic force. 
		The protons and neutrons in the nucleus are attracted to each other by the nuclear force. 
		This force is usually stronger than the electromagnetic force that repels the positively 
		charged protons from one another. Under certain circumstances, the repelling electromagnetic 
		force becomes stronger than the nuclear force. In this case, the nucleus splits and leaves 
		behind different elements. This is a form of nuclear decay.'''

#cleaning the texts
import re

stemmer = ps()        #PorterStemmer instance (avoid shadowing the class alias)
lemmatizer = wl()     #WordNetLemmatizer instance
sentences = st(para)  #tokenizing the paragraph into sentences
corpus = []

for i in range(len(sentences)):
	rev = re.sub('[^a-zA-Z]', ' ', sentences[i])  #replace everything except letters with a space
	rev = rev.lower()   #lowercase the sentence
	rev = rev.split()   #split the sentence into a list of words
	rev = [lemmatizer.lemmatize(word) for word in rev if word not in set(stopwords.words('english'))]
	rev = ' '.join(rev)
	corpus.append(rev)  #append the cleaned sentence to the corpus
	
#bag of words
from sklearn.feature_extraction.text import CountVectorizer  #importing CountVectorizer
cv=CountVectorizer()
x=cv.fit_transform(corpus).toarray() #transforming it to an array
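
A quick way to inspect the result (a sketch; x and cv as defined above):

print(x.shape)                      # (number of sentences, vocabulary size)
print(sorted(cv.vocabulary_)[:10])  # a few of the learned vocabulary terms
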
Example No. 21
    for l in f:
        l = l.split('\t')
        try:
            did = int(l[0])
        except ValueError:
            # skip lines whose first field is not a numeric doc id
            continue
        title = l[1]
        doc = l[-1]
        sys.stdout.write('\rHandling doc id: ' + str(did))
        sys.stdout.flush() # important

        if did in qids:
            #do sentence boundary detection
            if '\\n' in doc:
                doc = '\\n'.join(doc.split('\\n')[1:])
            doc = st(doc)[0]

            if '\\n' in doc:
                 doc = doc.split('\\n')[0]

            doc = doc.lower()

            title_tokens = title.split()
            if len(title_tokens) > 1:
                doc = doc.replace(title.lower(), '')

            title_tokens = [stemmer.stem(t.lower()) for t in title_tokens if t not in stopwords]

            keep = []
            for t in doc.split():
Example No. 22
		The electrons of an atom are attracted to the protons in an atomic nucleus by the electromagnetic force. 
		The protons and neutrons in the nucleus are attracted to each other by the nuclear force. 
		This force is usually stronger than the electromagnetic force that repels the positively 
		charged protons from one another. Under certain circumstances, the repelling electromagnetic 
		force becomes stronger than the nuclear force. In this case, the nucleus splits and leaves 
		behind different elements. This is a form of nuclear decay.'''

#dependencies
import re  #regular expression
from nltk.tokenize import sent_tokenize as st, word_tokenize as wt  #for tokenization
from nltk.corpus import stopwords  #stop words
from nltk.stem import WordNetLemmatizer as wl  #for lemmatization

wordnet = wl()  #object creation for lemmatization
corpus = []  #empty list
sentences = st(para)  #tokenizing the paragraph to sentences

for i in range(len(sentences)):
    rev = re.sub(
        '[^a-zA-Z]', ' ',
        sentences[i])  #replace everything except letters with a space
    rev = rev.lower()  #lowercase the sentence
    rev = rev.split()  #split the sentence into a list of words
    rev = [
        wordnet.lemmatize(word) for word in rev
        if word not in stopwords.words('english')
    ]
    rev = ' '.join(rev)
    corpus.append(rev)

#creating TF-IDF model
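
The snippet stops at the comment above; a minimal sketch of the announced TF-IDF step, assuming scikit-learn's TfidfVectorizer over the same corpus list:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
x = tfidf.fit_transform(corpus).toarray()  # TF-IDF weighted document-term matrix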