Example no. 1
def __init__(self, annotation_file_path, train_image_folder_path):
    self.annotation_file = annotation_file_path
    self.PATH = train_image_folder_path
    # Parse the annotation file into captions and matching image paths
    train_captions, img_path_vector = deal_annotations(
        self.annotation_file, self.PATH)
    # Preprocess the training images (return value not needed here)
    image_preprocess(img_path_vector)
    # Tokenize the captions; keep the tokenizer and the max caption length
    caption_vector, self.max_length, self.tokenizer = text_preprocess(
        train_captions, vocab_size=settings.vocab_size)
    # Pair image paths with caption vectors into the training dataset
    self.dataset = build_dataset(img_path_vector, caption_vector)
    self.total_num = len(img_path_vector)
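
# Hedged usage sketch for the constructor above. The wrapper class name
# (CaptionDataset) and the example paths are illustrative assumptions only;
# deal_annotations, image_preprocess, text_preprocess, build_dataset and
# settings come from the surrounding project and are not shown in this excerpt.
#
#   train_data = CaptionDataset(
#       annotation_file_path="annotations/captions_train.json",
#       train_image_folder_path="train_images/")
#   print(train_data.total_num, train_data.max_length)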
Example no. 2

import os
import sys

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

this_file_path = os.path.abspath(__file__)
folder_root = os.path.split(this_file_path)[0]
repo_root = os.path.split(folder_root)[0]

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import preprocess as pr

path_corpus = os.path.join(repo_root, "orig_text_data/NI_docs/")

## Load txt document file names
ocr_text = pr.text_preprocess(path_corpus)
ocr_text.files  #6946

# run through function to gather all text (as dictionary)
ocr_text_corpus = ocr_text.nvivo_ocr()

# Convert to Dataframe
ocr_corpus = pd.DataFrame(ocr_text_corpus.items())
ocr_corpus.columns = ['img_file', 'raw_text']

# Subset to pages that contain a justification
df = pd.read_csv(os.path.join(repo_root, 'justifications_clean_text_ohe.csv'))
just_imgs = df['img_file_orig'].unique().tolist()
ocr_corpus_subset = ocr_corpus.loc[ocr_corpus['img_file'].isin(just_imgs)]

# Define whether to use the whole corpus or the subset of pages with justifications
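
# Hedged sketch of a possible next step, since Word2Vec and word_tokenize are
# imported above but not used in this excerpt. The use_subset flag, the
# tokenization choices and the Word2Vec hyperparameters (gensim 4.x API) are
# illustrative assumptions, not the project's actual code.
use_subset = True
corpus = ocr_corpus_subset if use_subset else ocr_corpus

nltk.download('punkt')  # tokenizer models required by word_tokenize
sentences = [word_tokenize(str(text).lower()) for text in corpus['raw_text']]

w2v_model = Word2Vec(sentences=sentences, vector_size=100, window=5,
                     min_count=2, workers=4, seed=42)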
Example no. 3
import os
import sys

import pandas as pd

## Set project paths
this_file_path = os.path.abspath(__file__)
folder_root = os.path.split(this_file_path)[0]
folder_path = folder_root + '/'
repo_root = os.path.split(folder_root)[0]
j_path = os.path.join(repo_root, 'orig_text_data/just_0404') + '/'
#j_path = os.path.join(repo_root, 'orig_text_data/just_icr') + '/' #For ICR task

sys.path  # make sure the repo is in the path

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import preprocess as pr

################################################
##### 1) COMPILE JUSTIFICATIONS FROM NVIVO #####
################################################

## Load txt document
raw = pr.text_preprocess(j_path)
raw.files

cat_text = raw.nvivo_clumps()

# Quick check: print the justification category names
for key in cat_text:
    print(key)

# Create a df from the dict: each row is one of the 12 justification
# categories followed by a list of the individual "files" in that category
temp_df = pd.DataFrame.from_dict(cat_text, orient='index')

# What we want is a column with the category repeated for each entry of the
# list, and the list entries themselves becoming rows
df_long = temp_df.stack().reset_index(level=1,
                                      drop=True).to_frame(name='raw_text')
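
# Hedged follow-up sketch: after the stack() above, the justification category
# sits in the index rather than in a column. One way to make it an explicit
# column (the name 'justification_cat' is an assumption, not the project's):
df_long = df_long.reset_index().rename(columns={'index': 'justification_cat'})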
Example no. 4

'''
Main function:
1. Reads the JSON file, processes it and converts it to a dataframe.
2. Performs text cleaning (removing spaces, punctuation, short words; lower-casing).
3. Performs text preparation (stop-word removal, stemming and lemmatization).
4. Applies Latent Semantic Analysis (LSA) for feature extraction.
5. Classifies documents into 2 classes based on the LSA features:
   Patent Granted and Patent Not-Granted.
'''
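
# The feature_extraction and classify classes used in the main block below are
# not included in this excerpt. As a rough illustration only, the LSA step
# could look like the following scikit-learn sketch; the vectorizer settings
# and the number of topics are assumptions, not the project's actual values.
# (train_test_split is assumed to be scikit-learn's, as the main block uses it.)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

def lsa_features_sketch(texts, n_topics=100):
    # Build a TF-IDF document-term matrix, then reduce it to n_topics latent
    # dimensions with truncated SVD -- which is what LSA amounts to here.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    dtm = vectorizer.fit_transform(texts)
    svd = TruncatedSVD(n_components=n_topics, random_state=0)
    return svd.fit_transform(dtm), vectorizer, svd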
if __name__ == "__main__":

    # Convert the JSON file to a dataframe
    df = json_to_df('uspto.json')

    # Preprocess text from the summary column of the dataframe
    tp = text_preprocess(df)
    tp.text_cleaning()
    df = tp.prepare_text()

    # Latent Semantic Analysis for feature extraction
    lsa = feature_extraction(df)
    dtm, vectorizer = lsa.document_term_matrix()
    lsa.topic_model(dtm, vectorizer)

    # Train a model for classification
    X_train, X_test, y_train, y_test = train_test_split(
        dtm, df['Decision'].astype('int'), test_size=0.20, random_state=0)
    clf = classify(X_train, X_test, y_train, y_test)
    clf.make_classification()
    
    
        
Example no. 5

import os
import sys

import pandas as pd
import numpy as np

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import preprocess as pr

## Set project path 
this_file_path = os.path.abspath(__file__)
folder_root = os.path.split(this_file_path)[0]
repo_root = os.path.split(folder_root)[0]
all_path = os.path.join(repo_root, "orig_text_data/NI_docs/") 

#sys.path # make sure the repo is in the path 

## Load txt documents (all)
raw = pr.text_preprocess(all_path)
raw.files

## Randomly draw 10 documents
random_draw = np.random.choice(raw.files, 10)

## Loop through 10 documents to count characters (not including spaces)
words2 = {}
characters2 = {}
characters3 = []

for f in random_draw:
    # Extract text and parse into df
    with open(f) as text:
        words = 0
        characters = 0